diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml index e91f2c55..88a49dea 100644 --- a/.github/workflows/build-docs.yml +++ b/.github/workflows/build-docs.yml @@ -28,5 +28,8 @@ jobs: run: doxygen - name: Upload documentation - working-directory: ${{ github.workspace }}/docs/html - run: tar -czf - * | sshpass -p "${{ secrets.EMSCRIPTEN_DEPLOY_PASSWORD }}" ssh -o StrictHostKeyChecking=no github@pallas.ti.bfh.ch "rm -rf docs && mkdir docs && cd docs && tar xzvf -" + working-directory: ${{ github.workspace }}/docs + run: | + sshpass -p "${{ secrets.EMSCRIPTEN_DEPLOY_PASSWORD }}" sftp -oBatchMode=no -b - -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${{ secrets.EMSCRIPTEN_DEPLOY_REMOTE }}" << EOF + put -r html public/www/docs + EOF diff --git a/.github/workflows/build-wasm-emscripten.yml b/.github/workflows/build-wasm-emscripten.yml index f95eff74..60a53add 100644 --- a/.github/workflows/build-wasm-emscripten.yml +++ b/.github/workflows/build-wasm-emscripten.yml @@ -16,10 +16,10 @@ jobs: - name: Install Emscripten uses: mymindstorm/setup-emsdk@v11 with: - version: 3.1.46 + version: 3.1.60 - name: Configure CMake run: emcmake cmake -B ${{ github.workspace }}/build -DSL_BUILD_WAI=OFF -DSL_BUILD_WITH_OPENSSL=OFF -DSL_DOWNLOAD_DATA=OFF - name: Build - run: cmake --build ${{ github.workspace }}/build --target app-Demo-SLProject -j $(nproc) + run: cmake --build ${{ github.workspace }}/build --target app-demo -j $(nproc) diff --git a/.github/workflows/deploy-pages.yml b/.github/workflows/deploy-pages.yml index 435a873b..792f7811 100644 --- a/.github/workflows/deploy-pages.yml +++ b/.github/workflows/deploy-pages.yml @@ -13,12 +13,12 @@ concurrency: cancel-in-progress: false jobs: - deploy: + build-docs: runs-on: ubuntu-22.04 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Doxygen run: sudo apt install -y doxygen @@ -34,13 +34,25 @@ jobs: working-directory: ${{ github.workspace }}/docs run: doxygen - - 
name: Setup GitHub Pages - uses: actions/configure-pages@v5 + - name: Copy Images + run: cp -r ${{ github.workspace }}/docs/images ${{ github.workspace }}/docs/html - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: - path: "${{ github.workspace }}/docs/html" + path: ${{ github.workspace }}/docs/html + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + needs: build-docs + runs-on: ubuntu-22.04 + + steps: + - name: Setup GitHub Pages + uses: actions/configure-pages@v5 - name: Deploy to GitHub Pages id: deployment diff --git a/.github/workflows/deploy-wasm-emscripten.yml b/.github/workflows/deploy-wasm-emscripten.yml index cfc4b850..84153a5d 100644 --- a/.github/workflows/deploy-wasm-emscripten.yml +++ b/.github/workflows/deploy-wasm-emscripten.yml @@ -16,14 +16,28 @@ jobs: - name: Install Emscripten uses: mymindstorm/setup-emsdk@v11 with: - version: 3.1.46 + version: 3.1.60 - name: Configure CMake - run: emcmake cmake -B ${{ github.workspace }}/build -DSL_BUILD_WAI=OFF -DSL_BUILD_WITH_OPENSSL=OFF -DSL_DOWNLOAD_DATA=OFF -DCMAKE_BUILD_TYPE=Release + run: | + emcmake cmake \ + -B ${{ github.workspace }}/build \ + -DSL_BUILD_WAI=OFF \ + -DSL_BUILD_WITH_OPENSSL=OFF \ + -DSL_BUILD_WITH_MEDIAPIPE=OFF \ + -DSL_DOWNLOAD_DATA=OFF \ + -DCMAKE_BUILD_TYPE=Release - name: Build - run: cmake --build ${{ github.workspace }}/build --target app-Demo-SLProject -j $(nproc) + run: cmake --build ${{ github.workspace }}/build --target app-demo -j $(nproc) - name: Upload working-directory: ${{ github.workspace }}/build - run: sshpass -p "${{ secrets.EMSCRIPTEN_DEPLOY_PASSWORD }}" scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null app-Demo-SLProject.* github@pallas.ti.bfh.ch:. 
+ run: | + sshpass -p "${{ secrets.EMSCRIPTEN_DEPLOY_PASSWORD }}" sftp -oBatchMode=no -b - -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${{ secrets.EMSCRIPTEN_DEPLOY_REMOTE }}" << EOF + put app-demo.html public/www/app-demo.html + put app-demo.js public/www/app-demo.js + put app-demo.wasm public/www/app-demo.wasm + put app-demo.worker.js public/www/app-demo.worker.js + put app-demo.ww.js public/www/app-demo.ww.js + EOF diff --git a/.gitignore b/.gitignore index f9deb4cb..c07b604a 100644 --- a/.gitignore +++ b/.gitignore @@ -92,4 +92,5 @@ Tester /externals/prebuild_scripts/eigen /externals/prebuild_scripts/openssl -docs/html \ No newline at end of file +docs/html +.clangd \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a00eba4..a3a566fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,21 @@ endif () include(cmake/SetGitBranchNameAndCommitID.cmake) include(cmake/CompileOptions.cmake) +# Determine the platform that apps will run on. +# On desktop operating systems, the platform is GLFW. +# On Android, iOS, and Emscripten, the platform name is just directly the system name. 
+if(SYSTEM_NAME_UPPER MATCHES "^(WINDOWS|DARWIN|LINUX)$") + set(SL_PLATFORM "GLFW") +elseif(SYSTEM_NAME_UPPER STREQUAL "ANDROID") + set(SL_PLATFORM "ANDROID") +elseif(SYSTEM_NAME_UPPER STREQUAL "IOS") + set(SL_PLATFORM "IOS") +elseif(SYSTEM_NAME_UPPER STREQUAL "EMSCRIPTEN") + set(SL_PLATFORM "EMSCRIPTEN") +else() + set(SL_PLATFORM "UNKNOWN") +endif() + option(SL_DOWNLOAD_DATA "Specifies if the data ZIP file should be downloaded" ON) option(SL_DOWNLOAD_PREBUILTS "Specifies if prebuilt libraries should be downloaded" ON) option(SL_BUILD_WAI "Specifies if the WAI library should be built" ON) @@ -54,18 +69,22 @@ option(SL_BUILD_WITH_OPTIX "Specifies if Optix renderer should be built" OFF) option(SL_BUILD_WITH_KTX "Specifies if Kronos Texture library (ktx) should be used" ON) option(SL_BUILD_WITH_OPENSSL "Specifies if OpenSSL should be used" ON) option(SL_BUILD_WITH_ASSIMP "Specifies if Assimp should be used" ON) - -if ("${SYSTEM_NAME_UPPER}" MATCHES "IOS" OR "${SYSTEM_NAME_UPPER}" MATCHES "EMSCRIPTEN") - option(SL_BUILD_WITH_MEDIAPIPE "Specifies if MediaPipe should be used" OFF) -else () - option(SL_BUILD_WITH_MEDIAPIPE "Specifies if MediaPipe should be used" ON) -endif () +option(SL_BUILD_WITH_MEDIAPIPE "Specifies if MediaPipe should be used" ON) option(LIBIGL_USE_STATIC_LIBRARY "Specifies if LibIGL should be built statically" ON) +if(SYSTEM_NAME_UPPER STREQUAL "IOS") + set(SL_BUILD_WITH_MEDIAPIPE OFF) +elseif(SYSTEM_NAME_UPPER STREQUAL "EMSCRIPTEN") + set(SL_BUILD_WAI OFF) + set(SL_BUILD_WITH_OPENSSL OFF) + set(SL_BUILD_WITH_MEDIAPIPE OFF) +endif() + message(STATUS "----------------------------------------------------------------") message(STATUS "System: ${SYSTEM_NAME_UPPER}") message(STATUS "Architecture: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "Platform: ${SL_PLATFORM}") message(STATUS "----------------------------------------------------------------") message(STATUS "SL_DOWNLOAD_PREBUILTS: ${SL_DOWNLOAD_PREBUILTS}") message(STATUS "SL_BUILD_WAI: 
${SL_BUILD_WAI}") @@ -83,16 +102,15 @@ message(STATUS "---------------------------------------------------------------- if (SL_DOWNLOAD_DATA) include(cmake/DownloadData.cmake) + message(STATUS "----------------------------------------------------------------") endif () -message(STATUS "----------------------------------------------------------------") - if (SL_DOWNLOAD_PREBUILTS) include(cmake/DownloadPrebuilts.cmake) + message(STATUS "Finished downloading prebuilts") + message(STATUS "----------------------------------------------------------------") endif () -message(STATUS "----------------------------------------------------------------") - if ("${CMAKE_SYSTEM_NAME}" MATCHES "Android") message(STATUS "SL_APP: ${SL_APP}") option(SL_APP diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 2bcefcfc..0119ae2f 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -1,16 +1,378 @@ -if("${SYSTEM_NAME_UPPER}" MATCHES "ANDROID") - add_subdirectory(app_demo_slproject) -elseif("${SYSTEM_NAME_UPPER}" MATCHES "IOS") - add_subdirectory(app_demo_slproject) -else() +function(sl_add_app) + # ------------------------------------------------------------------------- + # Parse CMake function arguments. + # ------------------------------------------------------------------------- + + set(oneValueArgs + TARGET + ANDROID_APP_DIR + IOS_INFO_PLIST + IOS_DISPLAY_NAME + IOS_COPYRIGHT + IOS_ICON_NAME + ) + + set(multiValueArgs + PLATFORMS + HEADERS SOURCES + INCLUDE_DIRECTORIES + COMPILE_DEFINITIONS + IOS_RESOURCES + ) + + cmake_parse_arguments(APP "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") + + # ------------------------------------------------------------------------- + # Print app information and check platform support. 
+ # ------------------------------------------------------------------------- + + message(STATUS "Adding app: ${APP_TARGET}") + list(JOIN APP_PLATFORMS ", " platformsString) + message(STATUS " Supported platforms: ${platformsString}") + + if(NOT SL_PLATFORM IN_LIST APP_PLATFORMS) + message(STATUS " Disabled on this platform") + return() + endif() + + # ------------------------------------------------------------------------- + # Define headers and sources shared by all platforms. + # This includes the headers and sources for the current app. + # ------------------------------------------------------------------------- + + file(GLOB COMMON_HEADERS + ${SL_PROJECT_ROOT}/apps/source/App.h + ${SL_PROJECT_ROOT}/apps/source/CVCapture.h + ${SL_PROJECT_ROOT}/apps/source/AppCommon.h + ${SL_PROJECT_ROOT}/apps/source/SLScene.h + ${SL_PROJECT_ROOT}/apps/source/SLInterface.h + ${SL_PROJECT_ROOT}/apps/source/Scene.h + ${APP_HEADERS} + ) + + file(GLOB COMMON_SOURCES + ${SL_PROJECT_ROOT}/apps/source/CVCapture.cpp + ${SL_PROJECT_ROOT}/apps/source/AppCommon.cpp + ${SL_PROJECT_ROOT}/apps/source/SLInterface.cpp + ${SL_PROJECT_ROOT}/apps/source/SLProjectScene.cpp + ${APP_SOURCES} + ) + + # ------------------------------------------------------------------------- + # Create the target and configure platform-specific things like libraries + # to link or assets to copy on mobile platforms. 
+ # ------------------------------------------------------------------------- + + if(SL_PLATFORM STREQUAL "GLFW") + # --------------------------------------------------------------------- + # Configuration for platform GLFW + # --------------------------------------------------------------------- + + file(GLOB HEADERS + ${COMMON_HEADERS} + ) + + file(GLOB SOURCES + ${COMMON_SOURCES} + ${SL_PROJECT_ROOT}/apps/source/platforms/glfw/AppGLFW.cpp + ) + + add_executable( + ${APP_TARGET} + ${HEADERS} + ${SOURCES} + ) + + # Group source files for IDEs + source_group_by_path("${CMAKE_CURRENT_SOURCE_DIR}" "\\\\.h$|\\\\.hpp$" "Header Files" ${HEADERS}) + source_group_by_path("${CMAKE_CURRENT_SOURCE_DIR}" "\\\\.cpp$|\\\\.c$|\\\\.h$|\\\\.hpp$" "Source Files" ${SOURCES}) + + set_target_properties( + ${APP_TARGET} + PROPERTIES + ${DEFAULT_PROJECT_OPTIONS} + FOLDER "apps" + ) + + target_include_directories( + ${APP_TARGET} + PRIVATE + ${SL_PROJECT_ROOT}/apps/source/platforms/glfw + ) + + target_link_libraries( + ${APP_TARGET} + PRIVATE + ${glfw_LIBS} + ) + elseif(SL_PLATFORM STREQUAL "EMSCRIPTEN") + # --------------------------------------------------------------------- + # Configuration for platform Emscripten + # --------------------------------------------------------------------- + + file(GLOB HEADERS + ${COMMON_HEADERS} + ${SL_PROJECT_ROOT}/apps/source/WebCamera.h + ) + + file(GLOB SOURCES + ${COMMON_SOURCES} + ${SL_PROJECT_ROOT}/apps/source/WebCamera.cpp + ${SL_PROJECT_ROOT}/apps/source/platforms/emscripten/AppEmscripten.cpp + ) + + add_executable( + ${APP_TARGET} + ${HEADERS} + ${SOURCES} + ) + + target_include_directories( + ${APP_TARGET} + PRIVATE + ${SL_PROJECT_ROOT}/apps/source/platforms/emscripten + ) + + # Copy the HTML page and the server script to the target directory and replace + # the string "TARGET" in them with the name of the target. + # If you change these files, you have to re-run CMake. 
+ + file(READ "${SL_PROJECT_ROOT}/apps/source/platforms/emscripten/index.html" INDEX_SOURCE) + string(REPLACE "TARGET" "${APP_TARGET}" INDEX_SOURCE "${INDEX_SOURCE}") + file(WRITE "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${APP_TARGET}.html" "${INDEX_SOURCE}") + + file(READ "${SL_PROJECT_ROOT}/apps/source/platforms/emscripten/server.py" SERVER_SOURCE) + string(REPLACE "TARGET" "${APP_TARGET}" SERVER_SOURCE "${SERVER_SOURCE}") + file(WRITE "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/serve-${APP_TARGET}.py" "${SERVER_SOURCE}") + elseif(SL_PLATFORM STREQUAL "ANDROID") + # --------------------------------------------------------------------- + # Configuration for platform Android + # For more information about using CMake with Android Studio, read the + # documentation: https://d.android.com/studio/projects/add-native-code.html + # --------------------------------------------------------------------- + + file(GLOB HEADERS + ${COMMON_HEADERS} + ) + + file(GLOB SOURCES + ${COMMON_SOURCES} + ${SL_PROJECT_ROOT}/apps/source/platforms/android/AppAndroid.cpp + ) + + # Copy APK contents. 
+ include(CopyResourcesAppDemoSLProject) + copy_resources_slprojectdemo("${APP_ANDROID_APP_DIR}/src/main/assets/data") + + file(GLOB_RECURSE + MEDIAPIPE_ASSETS + ${SL_PROJECT_ROOT}/data/mediapipe/*.tflite + ${SL_PROJECT_ROOT}/data/mediapipe/*.txt + ) + + foreach (ASSET_PATH ${MEDIAPIPE_ASSETS}) + get_filename_component(ASSET_FILENAME "${ASSET_PATH}" NAME) + file(COPY "${ASSET_PATH}" DESTINATION "${APP_ANDROID_APP_DIR}/src/main/assets") + endforeach () + + add_library( + ${APP_TARGET} + SHARED + ${HEADERS} + ${SOURCES} + ) + + target_link_libraries( + ${APP_TARGET} + PRIVATE + libc++_shared.so + z + ) + elseif(SL_PLATFORM STREQUAL "IOS") + # --------------------------------------------------------------------- + # Configuration for platform iOS + # --------------------------------------------------------------------- + + file(GLOB HEADERS + ${COMMON_HEADERS} + ${SL_PROJECT_ROOT}/apps/source/platforms/ios/AppDelegate.h + ${SL_PROJECT_ROOT}/apps/source/platforms/ios/Utils_iOS.h + ${SL_PROJECT_ROOT}/apps/source/platforms/ios/ViewController.h + ) + + file(GLOB SOURCES + ${COMMON_SOURCES} + ${SL_PROJECT_ROOT}/apps/source/platforms/ios/AppDelegate.mm + ${SL_PROJECT_ROOT}/apps/source/platforms/ios/ViewController.mm + ${SL_PROJECT_ROOT}/apps/source/platforms/ios/AppIOS.mm + ) + + # Copy defined resources to build directory and add get folder + # reference using file(GLOB) (this is a secret trick that nobody knows). + + include(CopyResourcesAppDemoSLProject) + set(DATA_DIR "${CMAKE_BINARY_DIR}/data") + copy_resources_slprojectdemo("${DATA_DIR}") + file(GLOB DATA "${DATA_DIR}") + + set(RESOURCES + ${APP_IOS_RESOURCES} + ${DATA} + ) + + # Group resource files in Xcode's `Resources` directory. 
+ source_group(Resources FILES ${RESOURCES}) + + add_executable( + ${APP_TARGET} + MACOSX_BUNDLE + ${HEADERS} + ${SOURCES} + ${RESOURCES} + ) + + set(BUNDLE_IDENTIFIER "ch.bfh.ti.cpvrlab.${APP_TARGET}") + set(XCODE_CODESIGNIDENTITY "iPhone Developer") + set(XCODE_DEVELOPMENTTEAM ${XCODE_DEVELOPMENTTEAM}) + + message(STATUS " iOS bundle identifier: ${BUNDLE_IDENTIFIER}") + + # Use the "new" Xcode build system. + set(CMAKE_XCODE_BUILD_SYSTEM "12") + + # Target both iPhone (1) and iPad (2) + set(DEVICE_FAMILY "1,2") + + # Configure the Info.plist file used by Xcode. + # + # This takes the Info.plist template specified by the app, replaces variables starting + # with `MACOSX_BUNDLE` with their values, and writes the result to a temporary file. + # The temporary file is specified as the app's Info.plist in the `set_target_properties` call below. + # + # We don't use the variables in https://cmake.org/cmake/help/v3.17/prop_tgt/MACOSX_BUNDLE_INFO_PLIST.html + # because the native Info.plist variable functionality from CMake doesn't support all the variables we want to set. 
+ + set(MACOSX_BUNDLE_DISPLAY_NAME "${APP_IOS_DISPLAY_NAME}") + set(MACOSX_BUNDLE_EXECUTABLE_NAME "${APP_TARGET}") + set(MACOSX_BUNDLE_INFO_STRING "${BUNDLE_IDENTIFIER}") + set(MACOSX_BUNDLE_GUI_IDENTIFIER "${BUNDLE_IDENTIFIER}") + set(MACOSX_BUNDLE_BUNDLE_NAME "${BUNDLE_IDENTIFIER}") + set(MACOSX_BUNDLE_LONG_VERSION_STRING "1.0") + set(MACOSX_BUNDLE_SHORT_VERSION_STRING "1.0") + set(MACOSX_BUNDLE_BUNDLE_VERSION "1.0") + set(MACOSX_BUNDLE_COPYRIGHT "${APP_IOS_COPYRIGHT}") + set(INFO_PLIST_PATH "${CMAKE_CURRENT_BINARY_DIR}/${APP_TARGET}.Info.plist") + configure_file("${APP_IOS_INFO_PLIST}" "${INFO_PLIST_PATH}") + + set_target_properties( + ${APP_TARGET} + PROPERTIES + ${DEFAULT_PROJECT_OPTIONS} + FOLDER "apps" + + MACOSX_BUNDLE_INFO_PLIST "${INFO_PLIST_PATH}" + XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT "dwarf-with-dsym" + XCODE_ATTRIBUTE_GCC_PREFIX_HEADER "${SL_PROJECT_ROOT}/apps/source/platforms/ios/Prefix.pch" #this is a precompiled header! + RESOURCE "${RESOURCES}" + XCODE_ATTRIBUTE_GCC_PRECOMPILE_PREFIX_HEADER "YES" + # XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET} + XCODE_ATTRIBUTE_TARGETED_DEVICE_FAMILY ${DEVICE_FAMILY} + XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES + XCODE_ATTRIBUTE_COMBINE_HIDPI_IMAGES NO + XCODE_ATTRIBUTE_INSTALL_PATH "$(LOCAL_APPS_DIR)" + XCODE_ATTRIBUTE_ENABLE_TESTABILITY NO + XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN YES + XCODE_ATTRIBUTE_ASSETCATALOG_COMPILER_APPICON_NAME "${APP_IOS_ICON_NAME}" #defines icon name in asset catalog (images.xcassets) + XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "${BUNDLE_IDENTIFIER}" + # apple requires storyboard launchscreens now so the following will not come into store: + # XCODE_ATTRIBUTE_ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME "LaunchImage" #defines launch screen image name in asset catalog (images.xcassets) + + XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY ${XCODE_CODESIGNIDENTITY} + XCODE_ATTRIBUTE_DEVELOPMENT_TEAM ${XCODE_DEVELOPMENTTEAM} + ) + + target_include_directories( + ${APP_TARGET} + 
PRIVATE + ${SL_PROJECT_ROOT}/apps/source/platforms/ios + ) + + # Find system frameworks (libraries); REQUIRED (CMake 3.18+) stops configuration if one is missing. + find_library(AVFOUNDATION AVFoundation REQUIRED) + find_library(COREGRAPHICS CoreGraphics REQUIRED) + find_library(COREVIDEO CoreVideo REQUIRED) + find_library(COREMOTION CoreMotion REQUIRED) + find_library(COREMEDIA CoreMedia REQUIRED) + find_library(UIKIT UIKit REQUIRED) + find_library(OPENGLES OpenGLES REQUIRED) + find_library(GLKIT GLKit REQUIRED) + find_library(CORELOCATION CoreLocation REQUIRED) + + target_link_libraries( + ${APP_TARGET} + PRIVATE + ${AVFOUNDATION} + ${COREGRAPHICS} + ${COREVIDEO} + ${COREMOTION} + ${COREMEDIA} + ${UIKIT} + ${OPENGLES} + ${GLKIT} + ${CORELOCATION} + ) + endif() + + # ------------------------------------------------------------------------- + # Set target configuration shared by all platforms. + # ------------------------------------------------------------------------- + + enable_warnings(${APP_TARGET}) + + target_include_directories( + ${APP_TARGET} + PRIVATE + ${SL_PROJECT_ROOT}/apps/source + ${APP_INCLUDE_DIRECTORIES} + ) + + target_compile_definitions( + ${APP_TARGET} + PRIVATE + ${DEFAULT_COMPILE_DEFINITIONS} + ${APP_COMPILE_DEFINITIONS} + ) + + target_compile_options( + ${APP_TARGET} + PRIVATE + ${DEFAULT_COMPILE_OPTIONS} + ) + + target_link_libraries( + ${APP_TARGET} + PRIVATE + ${DEFAULT_LINKER_OPTIONS} + ${PlatformLinkLibs} + sl + sl_external + sl_utils + ${OpenCV_LIBS} + ) +endfunction() + +add_subdirectory(app_demo) +add_subdirectory(app_node) +add_subdirectory(app_minimal) + +if(NOT SYSTEM_NAME_UPPER MATCHES "^(ANDROID|IOS)$") if (SL_BUILD_EXERCISES) add_subdirectory(exercises) endif() - add_subdirectory(app_demo_imgui) - add_subdirectory(app_demo_node) - add_subdirectory(app_demo_slproject) + + add_subdirectory(app_imgui) if (SL_BUILD_WEBGPU_DEMO AND NOT ("${SYSTEM_NAME_UPPER}" MATCHES "EMSCRIPTEN")) - add_subdirectory(app_demo_webgpu) + add_subdirectory(app_webgpu) endif () -endif() \ No newline at end of file 
+endif() diff --git a/apps/app_demo_slproject/Benchmarks.ods b/apps/app_demo/Benchmarks.ods similarity index 100% rename from apps/app_demo_slproject/Benchmarks.ods rename to apps/app_demo/Benchmarks.ods diff --git a/apps/app_demo/CMakeLists.txt b/apps/app_demo/CMakeLists.txt new file mode 100644 index 00000000..991d4ee1 --- /dev/null +++ b/apps/app_demo/CMakeLists.txt @@ -0,0 +1,41 @@ +sl_add_app( + TARGET "app-demo" + + PLATFORMS + "GLFW" + "EMSCRIPTEN" + "ANDROID" + "IOS" + + COMPILE_DEFINITIONS + "SL_STARTSCENE=SID_ShaderIBL" + + HEADERS + "source/*.h" + "source/scenes/*.h" + + SOURCES + "source/*.cpp" + "source/scenes/*.cpp" + + INCLUDE_DIRECTORIES + "source/" + "source/scenes/" + + # On Android, set the path to the `app` directory + ANDROID_APP_DIR "${SL_PROJECT_ROOT}/apps/source/platforms/android/example_project/app" + + # On iOS, set the path to the Info.plist template as well as some other properties. + IOS_INFO_PLIST "${SL_PROJECT_ROOT}/apps/source/platforms/ios/example_project/plist.in" + IOS_DISPLAY_NAME "app-demo" + IOS_COPYRIGHT "Copyright Berner Fachhochschule (Marcus Hudritsch)" + IOS_ICON_NAME "AppIcon" + + # Define additional Xcode app resources for iOS. 
+ IOS_RESOURCES + "${SL_PROJECT_ROOT}/apps/source/platforms/ios/example_project/Base.lproj/ViewController_iPad.xib" + "${SL_PROJECT_ROOT}/apps/source/platforms/ios/example_project/Base.lproj/ViewController_iPhone.xib" + "${SL_PROJECT_ROOT}/apps/source/platforms/ios/example_project/LaunchScreenSLDemo.storyboard" + "${SL_PROJECT_ROOT}/apps/source/platforms/ios/example_project/Images/Images.xcassets" + "${SL_PROJECT_ROOT}/apps/source/platforms/ios/example_project/Images/LaunchImage_1024x768.png" +) diff --git a/apps/app_demo_slproject/glfw/imgui.ini b/apps/app_demo/imgui.ini similarity index 100% rename from apps/app_demo_slproject/glfw/imgui.ini rename to apps/app_demo/imgui.ini diff --git a/apps/app_demo_slproject/source/AppDemoGui.cpp b/apps/app_demo/source/AppDemoGui.cpp similarity index 75% rename from apps/app_demo_slproject/source/AppDemoGui.cpp rename to apps/app_demo/source/AppDemoGui.cpp index 8f21f9a3..06a95157 100644 --- a/apps/app_demo_slproject/source/AppDemoGui.cpp +++ b/apps/app_demo/source/AppDemoGui.cpp @@ -1,17 +1,19 @@ -//############################################################################# -// File: AppDemoGui.cpp -// Purpose: UI with the ImGUI framework fully rendered in OpenGL 3+ -// Date: Summer 2017 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# +/** + * \file AppDemoGui.cpp + * \brief UI with the ImGUI framework fully rendered in OpenGL 3+ + * \date Summer 2017 + * \authors Marcus Hudritsch + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ #include -#include +#include +#include +#include +#include #include - #include #include #include @@ -35,13 +37,14 @@ #include #include #include -#include +#include #include #include #include -#include #include -#include + +#define IMGUI_DEFINE_MATH_OPERATORS +#include #ifndef SL_EMSCRIPTEN # include @@ -56,14 +59,15 @@ #endif //----------------------------------------------------------------------------- -extern CVTracked* tracker; // Global pointer declared in AppDemoTracking -extern SLNode* trackedNode; // Global pointer declared in AppDemoTracking -extern SLGLTexture* gTexMRI3D; // Global pointer declared in AppDemoLoad -extern SLNode* gDragonModel; // Global pointer declared in AppDemoLoad +extern CVTracked* gVideoTracker; // Global pointer declared in AppDemoTracking +extern SLNode* gVideoTrackedNode; // Global pointer declared in AppDemoTracking +extern SLGLTexture* gTexMRI3D; // Global pointer declared in AppDemoLoad +extern SLNode* gDragonModel; // Global pointer declared in AppDemoLoad //----------------------------------------------------------------------------- //! 
Vector getter callback for combo and listbox with std::vector -static auto vectorGetter = [](void* vec, int idx, const char** out_text) +static auto vectorGetter = + [](void* vec, int idx, const char** out_text) { auto& vector = *(SLVstring*)vec; if (idx < 0 || idx >= (int)vector.size()) @@ -125,6 +129,7 @@ SLbool AppDemoGui::showDateAndTime = false; std::time_t AppDemoGui::adjustedTime = 0; SLbool AppDemoGui::_horizonVisuEnabled = false; SLbool AppDemoGui::hideUI = false; +SLstring AppDemoGui::loadingString = ""; // Scene node for Christoffel objects static SLNode* bern = nullptr; @@ -202,7 +207,7 @@ int ftpCallbackXfer(off64_t xfered, void* arg) { int xferedPC = (int)((float)xfered / (float)ftpXferSizeMax * 100.0f); // cout << "Bytes transferred: " << xfered << " (" << xferedPC << ")" << endl; - AppDemo::jobProgressNum(xferedPC); + AppCommon::jobProgressNum(xferedPC); } else cout << "Bytes transferred: " << xfered << endl; @@ -216,17 +221,48 @@ void AppDemoGui::clear() } //----------------------------------------------------------------------------- //! This is the main building function for the GUI of the Demo apps -/*! Is is passed to the AppDemoGui::build function in main of the app-Demo-SLProject +/*! It is passed to the AppDemoGui::build function in main of the app-demo app. This function will be called once per frame roughly at the end of SLSceneView::onPaint in SLSceneView::draw2DGL by calling ImGui::Render.\n - See also the comments on SLGLImGui. + See also the comments on SLImGui. 
*/ void AppDemoGui::build(SLScene* s, SLSceneView* sv) { PROFILE_FUNCTION(); - assert(s->assetManager() && "No asset manager assigned to scene!"); - SLAssetManager* am = s->assetManager(); + // assert(s->assetManager() && "No asset manager assigned to scene!"); + SLAssetManager* am = AppCommon::assetManager; + + if (!AppCommon::scene) + { + ImGui::Begin("Loading", + nullptr, + ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoBackground | ImGuiWindowFlags_NoNavInputs); + + float width = static_cast(sv->viewportW()); + float height = static_cast(sv->viewportH()); + ImGui::SetWindowSize(ImVec2(width, height)); + ImGui::SetWindowPos(ImVec2(0, 0)); + + ImVec2 center(0.5f * width, 0.5f * height); + + ImDrawList* drawList = ImGui::GetWindowDrawList(); + + drawList->AddRectFilled(ImVec2(0, 0), ImVec2(width, height), IM_COL32(40, 40, 40, 255)); + drawList->AddCircle(center, 50, IM_COL32(105, 125, 145, 255), 0, 10.0f); + + float offset = 8.0f * static_cast(ImGui::GetTime()); + drawList->PathArcTo(center, 50, offset, offset + 0.25f * 2 * PI); + drawList->PathStroke(IM_COL32(250, 165, 0, 255), 0, 10.0f); + + const char* text = loadingString.c_str(); + ImGui::SetCursorPosX(0.5f * (width - ImGui::CalcTextSize(text).x)); + ImGui::SetCursorPosY(0.5f * height + 100.0f); + ImGui::Text(text); + + ImGui::End(); + return; + } if (AppDemoGui::hideUI || (sv->camera() && sv->camera()->projType() == P_stereoSideBySideD)) @@ -241,19 +277,19 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) /////////////////////////////////// // if parallel jobs are running show only the progress information - if (AppDemo::jobIsRunning) + if (AppCommon::jobIsRunning) { centerNextWindow(sv, 0.9f, 0.5f); ImGui::Begin("Parallel Job in Progress", &showProgress, - ImGuiWindowFlags_NoTitleBar); + ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoNavInputs); ImGui::Text("Parallel Job in Progress:"); ImGui::Separator(); - ImGui::Text("%s", AppDemo::jobProgressMsg().c_str()); - if (AppDemo::jobProgressMax() > 0) 
+ ImGui::Text("%s", AppCommon::jobProgressMsg().c_str()); + if (AppCommon::jobProgressMax() > 0) { - float num = (float)AppDemo::jobProgressNum(); - float max = (float)AppDemo::jobProgressMax(); + float num = (float)AppCommon::jobProgressNum(); + float max = (float)AppCommon::jobProgressMax(); ImGui::ProgressBar(num / max); } else @@ -263,9 +299,9 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) ImGui::Separator(); ImGui::Text("Parallel Jobs to follow: %u", - (uint)AppDemo::jobsToBeThreaded.size()); + (uint)AppCommon::jobsToBeThreaded.size()); ImGui::Text("Sequential Jobs to follow: %u", - (uint)AppDemo::jobsToFollowInMain.size()); + (uint)AppCommon::jobsToFollowInMain.size()); ImGui::End(); return; } @@ -279,7 +315,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) // We are using the ImGuiWindowFlags_NoDocking flag to make the parent window not dockable into, // because it would be confusing to have two docking targets within each others. - ImGuiWindowFlags window_flags = ImGuiWindowFlags_NoDocking; + ImGuiWindowFlags window_flags = ImGuiWindowFlags_NoDocking | ImGuiWindowFlags_NoNavInputs; if (opt_fullscreen) { ImGuiViewport* viewport = ImGui::GetMainViewport(); @@ -327,12 +363,12 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) if (showAbout) { centerNextWindow(sv); - ImGui::Begin("About SLProject", &showAbout, ImGuiWindowFlags_NoResize); - ImGui::Text("Version: %s", AppDemo::version.c_str()); - ImGui::Text("Configuration: %s", AppDemo::configuration.c_str()); + ImGui::Begin("About SLProject", &showAbout, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoNavInputs); + ImGui::Text("Version: %s", AppCommon::version.c_str()); + ImGui::Text("Configuration: %s", AppCommon::configuration.c_str()); ImGui::Separator(); - ImGui::Text("Git Branch: %s (Commit: %s)", AppDemo::gitBranch.c_str(), AppDemo::gitCommit.c_str()); - ImGui::Text("Git Date: %s", AppDemo::gitDate.c_str()); + ImGui::Text("Git Branch: %s (Commit: %s)", AppCommon::gitBranch.c_str(), 
AppCommon::gitCommit.c_str()); + ImGui::Text("Git Date: %s", AppCommon::gitDate.c_str()); ImGui::Separator(); ImGui::TextWrapped("%s", infoAbout.c_str()); ImGui::End(); @@ -342,7 +378,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) if (showHelp) { centerNextWindow(sv); - ImGui::Begin("Help on Interaction", &showHelp, ImGuiWindowFlags_NoResize); + ImGui::Begin("Help on Interaction", &showHelp, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextWrapped("%s", infoHelp.c_str()); ImGui::End(); return; @@ -351,7 +387,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) if (showHelpCalibration) { centerNextWindow(sv); - ImGui::Begin("Help on Camera Calibration", &showHelpCalibration, ImGuiWindowFlags_NoResize); + ImGui::Begin("Help on Camera Calibration", &showHelpCalibration, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextWrapped("%s", infoCalibrate.c_str()); ImGui::End(); return; @@ -360,7 +396,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) if (showCredits) { centerNextWindow(sv); - ImGui::Begin("Credits for all Contributors and external Libraries", &showCredits, ImGuiWindowFlags_NoResize); + ImGui::Begin("Credits for all Contributors and external Libraries", &showCredits, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextWrapped("%s", infoCredits.c_str()); ImGui::End(); return; @@ -437,14 +473,14 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) SLfloat updateDODTimePC = Utils::clamp(updateDODTime / ft * 100.0f, 0.0f, 100.0f); snprintf(m + strlen(m), sizeof(m), " EntityWM : %5.1f ms (%3d%%)\n", updateDODTime, (SLint)updateDODTimePC); #endif - if (!s->animManager().allAnimNames().empty()) + if (!s->animManager().animationNames().empty()) { snprintf(m + strlen(m), sizeof(m), " Anim. 
: %5.1f ms (%3d%%)\n", updateAnimTime, (SLint)updateAnimTimePC); snprintf(m + strlen(m), sizeof(m), " AABB : %5.1f ms (%3d%%)\n", updateAABBTime, (SLint)updateAABBTimePC); } #ifndef SL_EMSCRIPTEN - if (vt != VT_NONE && tracker != nullptr && trackedNode != nullptr) + if (vt != VT_NONE && gVideoTracker != nullptr && gVideoTrackedNode != nullptr) { snprintf(m + strlen(m), sizeof(m), " Tracking : %5.1f ms (%3d%%)\n", trackingTime, (SLint)trackingTimePC); snprintf(m + strlen(m), sizeof(m), " Detect : %5.1f ms (%3d%%)\n", detectTime, (SLint)detectTimePC); @@ -533,7 +569,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) } ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Timing", &showStatsTiming, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGui::Begin("Timing", &showStatsTiming, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextUnformatted(m); ImGui::End(); ImGui::PopFont(); @@ -552,13 +588,13 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) SLfloat avgTriPerVox = vox > 0.0f ? 
numRTTria / (vox - voxEmpty) : 0.0f; SLint numOverdrawnNodes = (int)sv->nodesOverdrawn().size(); SLint numVisibleNodes = (int)(stats3D.numNodesOpaque + stats3D.numNodesBlended + numOverdrawnNodes); - SLint numGroupPC = (SLint)((SLfloat)stats3D.numNodesGroup / (SLfloat)stats3D.numNodes * 100.0f); - SLint numLeafPC = (SLint)((SLfloat)stats3D.numNodesLeaf / (SLfloat)stats3D.numNodes * 100.0f); - SLint numLightsPC = (SLint)((SLfloat)stats3D.numLights / (SLfloat)stats3D.numNodes * 100.0f); - SLint numOpaquePC = (SLint)((SLfloat)stats3D.numNodesOpaque / (SLfloat)stats3D.numNodes * 100.0f); - SLint numBlendedPC = (SLint)((SLfloat)stats3D.numNodesBlended / (SLfloat)stats3D.numNodes * 100.0f); - SLint numOverdrawnPC = (SLint)((SLfloat)numOverdrawnNodes / (SLfloat)stats3D.numNodes * 100.0f); - SLint numVisiblePC = (SLint)((SLfloat)numVisibleNodes / (SLfloat)stats3D.numNodes * 100.0f); + SLint numGroupPC = stats3D.numNodes == 0 ? 0 : (SLint)((SLfloat)stats3D.numNodesGroup / (SLfloat)stats3D.numNodes * 100.0f); + SLint numLeafPC = stats3D.numNodes == 0 ? 0 : (SLint)((SLfloat)stats3D.numNodesLeaf / (SLfloat)stats3D.numNodes * 100.0f); + SLint numLightsPC = stats3D.numNodes == 0 ? 0 : (SLint)((SLfloat)stats3D.numLights / (SLfloat)stats3D.numNodes * 100.0f); + SLint numOpaquePC = stats3D.numNodes == 0 ? 0 : (SLint)((SLfloat)stats3D.numNodesOpaque / (SLfloat)stats3D.numNodes * 100.0f); + SLint numBlendedPC = stats3D.numNodes == 0 ? 0 : (SLint)((SLfloat)stats3D.numNodesBlended / (SLfloat)stats3D.numNodes * 100.0f); + SLint numOverdrawnPC = stats3D.numNodes == 0 ? 0 : (SLint)((SLfloat)numOverdrawnNodes / (SLfloat)stats3D.numNodes * 100.0f); + SLint numVisiblePC = stats3D.numNodes == 0 ? 
0 : (SLint)((SLfloat)numVisibleNodes / (SLfloat)stats3D.numNodes * 100.0f); // Calculate total size of texture bytes on CPU SLfloat cpuMBTexture = 0; @@ -570,16 +606,15 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) SLfloat cpuMBMeshes = (SLfloat)stats3D.numBytes / 1E6f; SLfloat cpuMBVoxels = (SLfloat)stats3D.numBytesAccel / 1E6f; SLfloat cpuMBTotal = cpuMBTexture + cpuMBMeshes + cpuMBVoxels; - SLint cpuMBTexturePC = (SLint)(cpuMBTexture / cpuMBTotal * 100.0f); - SLint cpuMBMeshesPC = (SLint)(cpuMBMeshes / cpuMBTotal * 100.0f); - SLint cpuMBVoxelsPC = (SLint)(cpuMBVoxels / cpuMBTotal * 100.0f); + SLint cpuMBTexturePC = std::abs(cpuMBTotal) < 1E-5f ? 0 : (SLint)(cpuMBTexture / cpuMBTotal * 100.0f); + SLint cpuMBMeshesPC = std::abs(cpuMBTotal) < 1E-5f ? 0 : (SLint)(cpuMBMeshes / cpuMBTotal * 100.0f); + SLint cpuMBVoxelsPC = std::abs(cpuMBTotal) < 1E-5f ? 0 : (SLint)(cpuMBVoxels / cpuMBTotal * 100.0f); SLfloat gpuMBTexture = (SLfloat)SLGLTexture::totalNumBytesOnGPU / 1E6f; SLfloat gpuMBVbo = (SLfloat)SLGLVertexBuffer::totalBufferSize / 1E6f; SLfloat gpuMBTotal = gpuMBTexture + gpuMBVbo; - SLint gpuMBTexturePC = (SLint)(gpuMBTexture / gpuMBTotal * 100.0f); - SLint gpuMBVboPC = (SLint)(gpuMBVbo / gpuMBTotal * 100.0f); + SLint gpuMBTexturePC = std::abs(gpuMBTotal) < 1E-5 ? 0 : (SLint)(gpuMBTexture / gpuMBTotal * 100.0f); + SLint gpuMBVboPC = std::abs(gpuMBTotal) < 1E-5 ? 0 : (SLint)(gpuMBVbo / gpuMBTotal * 100.0f); - snprintf(m + strlen(m), sizeof(m), "Name: %s\n", s->name().c_str()); snprintf(m + strlen(m), sizeof(m), "No. 
of Nodes :%5d (100%%)\n", stats3D.numNodes); snprintf(m + strlen(m), sizeof(m), "- Group Nodes :%5d (%3d%%)\n", stats3D.numNodesGroup, numGroupPC); snprintf(m + strlen(m), sizeof(m), "- Leaf Nodes :%5d (%3d%%)\n", stats3D.numNodesLeaf, numLeafPC); @@ -605,11 +640,11 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) // Switch to fixed font ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Scene Statistics", &showStatsScene, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGui::Begin("Scene Statistics", &showStatsScene, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); + ImGui::Text("%s (%d)", s->name().c_str(), AppCommon::sceneID); + ImGui::Separator(); ImGui::TextUnformatted(m); - ImGui::Separator(); - ImGui::Text("Global Resources:"); string label = "Meshes (" + std::to_string(am->meshes().size()) + ")"; @@ -753,18 +788,18 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) snprintf(m + strlen(m), sizeof(m), "Calib. state : %s\n", c->stateStr().c_str()); snprintf(m + strlen(m), sizeof(m), "Num. caps : %d\n", c->numCapturedImgs()); - if (vt != VT_NONE && tracker != nullptr && trackedNode != nullptr) + if (vt != VT_NONE && gVideoTracker != nullptr && gVideoTrackedNode != nullptr) { snprintf(m + strlen(m), sizeof(m), "-------------:\n"); - if (typeid(*trackedNode) == typeid(SLCamera)) + if (typeid(*gVideoTrackedNode) == typeid(SLCamera)) { - SLVec3f cameraPos = trackedNode->updateAndGetWM().translation(); + SLVec3f cameraPos = gVideoTrackedNode->updateAndGetWM().translation(); snprintf(m + strlen(m), sizeof(m), "Dist. to zero: %4.2f\n", cameraPos.length()); } else { SLVec3f cameraPos = ((SLNode*)sv->camera())->updateAndGetWM().translation(); - SLVec3f objectPos = trackedNode->updateAndGetWM().translation(); + SLVec3f objectPos = gVideoTrackedNode->updateAndGetWM().translation(); SLVec3f camToObj = objectPos - cameraPos; snprintf(m + strlen(m), sizeof(m), "Dist. 
to obj.: %4.2f\n", camToObj.length()); } @@ -772,16 +807,16 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) // Switch to fixed font ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Video", &showStatsVideo, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGui::Begin("Video", &showStatsVideo, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextUnformatted(m); ImGui::End(); ImGui::PopFont(); } #ifdef SL_BUILD_WAI - if (showStatsWAI && AppDemo::sceneID == SID_VideoTrackWAI) + if (showStatsWAI && AppCommon::sceneID == SID_VideoTrackWAI) { ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("WAI Statistics", &showStatsWAI, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGui::Begin("WAI Statistics", &showStatsWAI, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); if (!AverageTiming::instance().empty()) { @@ -810,12 +845,13 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) window_flags |= ImGuiWindowFlags_NoTitleBar; window_flags |= ImGuiWindowFlags_NoResize; window_flags |= ImGuiWindowFlags_NoScrollbar; + window_flags |= ImGuiWindowFlags_NoNavInputs; SLfloat w = (SLfloat)sv->viewportW(); ImVec2 size = ImGui::CalcTextSize(s->info().c_str(), nullptr, true, w); - SLfloat h = size.y + SLGLImGui::fontPropDots * 2.0f; + SLfloat h = size.y + SLImGui::fontPropDots * 2.0f; SLstring info = "Scene Info: " + s->info(); ImGui::SetNextWindowPos(ImVec2(0, (float)sv->scrH() - h)); @@ -972,7 +1008,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) SLchar m[2550]; // message character array m[0] = 0; // set zero length - snprintf(m + strlen(m), sizeof(m), "SLProject Version: %s\n", AppDemo::version.c_str()); + snprintf(m + strlen(m), sizeof(m), "SLProject Version: %s\n", AppCommon::version.c_str()); #ifdef _DEBUG snprintf(m + strlen(m), sizeof(m), "Build Config. 
: Debug\n"); #else @@ -1010,7 +1046,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) // Switch to fixed font ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Device Informations", &showInfosDevice, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGui::Begin("Device Informations", &showInfosDevice, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextUnformatted(m); ImGui::End(); ImGui::PopFont(); @@ -1020,32 +1056,32 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) { SLchar m[1024]; // message character array m[0] = 0; // set zero length - SLVec3d offsetToOrigin = AppDemo::devLoc.originENU() - AppDemo::devLoc.locENU(); - snprintf(m + strlen(m), sizeof(m), "Uses IMU Senor : %s\n", AppDemo::devRot.isUsed() ? "yes" : "no"); - snprintf(m + strlen(m), sizeof(m), "Pitch (deg) : %3.1f\n", AppDemo::devRot.pitchDEG()); - snprintf(m + strlen(m), sizeof(m), "Yaw (deg) : %3.1f\n", AppDemo::devRot.yawDEG()); - snprintf(m + strlen(m), sizeof(m), "Roll (deg) : %3.1f\n", AppDemo::devRot.rollDEG()); - snprintf(m + strlen(m), sizeof(m), "No. averaged : %d\n", AppDemo::devRot.numAveraged()); - // snprintf(m + strlen(m), sizeof(m), "Pitch Offset(deg): %3.1f\n", AppDemo::devRot.pitchOffsetDEG()); - // snprintf(m + strlen(m), sizeof(m), "Yaw Offset(deg): %3.1f\n", AppDemo::devRot.yawOffsetDEG()); - snprintf(m + strlen(m), sizeof(m), "Rot. Offset mode : %s\n", AppDemo::devRot.offsetModeStr().c_str()); + SLVec3d offsetToOrigin = AppCommon::devLoc.originENU() - AppCommon::devLoc.locENU(); + snprintf(m + strlen(m), sizeof(m), "Uses IMU Senor : %s\n", AppCommon::devRot.isUsed() ? 
"yes" : "no"); + snprintf(m + strlen(m), sizeof(m), "Pitch (deg) : %3.1f\n", AppCommon::devRot.pitchDEG()); + snprintf(m + strlen(m), sizeof(m), "Yaw (deg) : %3.1f\n", AppCommon::devRot.yawDEG()); + snprintf(m + strlen(m), sizeof(m), "Roll (deg) : %3.1f\n", AppCommon::devRot.rollDEG()); + snprintf(m + strlen(m), sizeof(m), "No. averaged : %d\n", AppCommon::devRot.numAveraged()); + // snprintf(m + strlen(m), sizeof(m), "Pitch Offset(deg): %3.1f\n", AppCommon::devRot.pitchOffsetDEG()); + // snprintf(m + strlen(m), sizeof(m), "Yaw Offset(deg): %3.1f\n", AppCommon::devRot.yawOffsetDEG()); + snprintf(m + strlen(m), sizeof(m), "Rot. Offset mode : %s\n", AppCommon::devRot.offsetModeStr().c_str()); snprintf(m + strlen(m), sizeof(m), "------------------\n"); - snprintf(m + strlen(m), sizeof(m), "Uses GPS Sensor : %s\n", AppDemo::devLoc.isUsed() ? "yes" : "no"); - snprintf(m + strlen(m), sizeof(m), "Latitude (deg) : %10.5f\n", AppDemo::devLoc.locLatLonAlt().lat); - snprintf(m + strlen(m), sizeof(m), "Longitude (deg) : %10.5f\n", AppDemo::devLoc.locLatLonAlt().lon); - snprintf(m + strlen(m), sizeof(m), "Alt. used (m) : %10.2f\n", AppDemo::devLoc.locLatLonAlt().alt); - snprintf(m + strlen(m), sizeof(m), "Alt. GPS (m) : %10.2f\n", AppDemo::devLoc.altGpsM()); - snprintf(m + strlen(m), sizeof(m), "Alt. DEM (m) : %10.2f\n", AppDemo::devLoc.altDemM()); - snprintf(m + strlen(m), sizeof(m), "Alt. origin (m) : %10.2f\n", AppDemo::devLoc.altDemM()); - snprintf(m + strlen(m), sizeof(m), "Accuracy Rad.(m) : %6.1f\n", AppDemo::devLoc.locAccuracyM()); + snprintf(m + strlen(m), sizeof(m), "Uses GPS Sensor : %s\n", AppCommon::devLoc.isUsed() ? "yes" : "no"); + snprintf(m + strlen(m), sizeof(m), "Latitude (deg) : %10.5f\n", AppCommon::devLoc.locLatLonAlt().lat); + snprintf(m + strlen(m), sizeof(m), "Longitude (deg) : %10.5f\n", AppCommon::devLoc.locLatLonAlt().lon); + snprintf(m + strlen(m), sizeof(m), "Alt. 
used (m) : %10.2f\n", AppCommon::devLoc.locLatLonAlt().alt); + snprintf(m + strlen(m), sizeof(m), "Alt. GPS (m) : %10.2f\n", AppCommon::devLoc.altGpsM()); + snprintf(m + strlen(m), sizeof(m), "Alt. DEM (m) : %10.2f\n", AppCommon::devLoc.altDemM()); + snprintf(m + strlen(m), sizeof(m), "Alt. origin (m) : %10.2f\n", AppCommon::devLoc.altDemM()); + snprintf(m + strlen(m), sizeof(m), "Accuracy Rad.(m) : %6.1f\n", AppCommon::devLoc.locAccuracyM()); snprintf(m + strlen(m), sizeof(m), "Dist. Origin (m) : %6.1f\n", offsetToOrigin.length()); - snprintf(m + strlen(m), sizeof(m), "Origin improve(s): %6.1f sec.\n", AppDemo::devLoc.improveTime()); - snprintf(m + strlen(m), sizeof(m), "Loc. Offset mode : %s\n", AppDemo::devLoc.offsetModeStr().c_str()); - snprintf(m + strlen(m), sizeof(m), "Loc. Offset (m) : %s\n", AppDemo::devLoc.offsetENU().toString(",", 1).c_str()); + snprintf(m + strlen(m), sizeof(m), "Origin improve(s): %6.1f sec.\n", AppCommon::devLoc.improveTime()); + snprintf(m + strlen(m), sizeof(m), "Loc. Offset mode : %s\n", AppCommon::devLoc.offsetModeStr().c_str()); + snprintf(m + strlen(m), sizeof(m), "Loc. 
Offset (m) : %s\n", AppCommon::devLoc.offsetENU().toString(",", 1).c_str()); // Switch to fixed font ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Sensor Information", &showInfosSensors, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGui::Begin("Sensor Information", &showInfosSensors, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); ImGui::TextUnformatted(m); ImGui::End(); ImGui::PopFont(); @@ -1065,15 +1101,19 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) { ImGuiWindowFlags window_flags = 0; window_flags |= ImGuiWindowFlags_AlwaysAutoResize; + window_flags |= ImGuiWindowFlags_NoNavInputs; + ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); ImGui::Begin("User Interface Preferences", &showUIPrefs, window_flags); ImGui::PushItemWidth(ImGui::GetWindowWidth() * 0.66f); - ImGui::SliderFloat("Prop. Font Size", &SLGLImGui::fontPropDots, 16.f, 70.f, "%0.0f"); - ImGui::SliderFloat("Fixed Font Size", &SLGLImGui::fontFixedDots, 13.f, 50.f, "%0.0f"); + ImGui::SliderFloat("Prop. 
Font Size", &SLImGui::fontPropDots, 16.f, 70.f, "%0.0f"); + ImGui::SliderFloat("Fixed Font Size", &SLImGui::fontFixedDots, 13.f, 50.f, "%0.0f"); ImGuiStyle& style = ImGui::GetStyle(); + if (ImGui::SliderFloat("Item Spacing X", &style.ItemSpacing.x, 0.0f, 20.0f, "%0.0f")) style.WindowPadding.x = style.FramePadding.x = style.ItemInnerSpacing.x = style.ItemSpacing.x; + if (ImGui::SliderFloat("Item Spacing Y", &style.ItemSpacing.y, 0.0f, 20.0f, "%0.0f")) { style.FramePadding.y = style.ItemInnerSpacing.y = style.ItemSpacing.y; @@ -1090,7 +1130,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) snprintf(reset, sizeof(reset), "Reset User Interface (DPI: %d)", sv->dpi()); if (ImGui::MenuItem(reset)) { - SLstring fullPathFilename = AppDemo::configPath + "DemoGui.yml"; + SLstring fullPathFilename = AppCommon::configPath + "DemoGui.yml"; Utils::deleteFile(fullPathFilename); loadConfig(sv->dpi()); } @@ -1102,11 +1142,13 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) if (showDateAndTime) { - if (AppDemo::devLoc.originLatLonAlt() != SLVec3d::ZERO || - AppDemo::devLoc.defaultLatLonAlt() != SLVec3d::ZERO) + if (AppCommon::devLoc.originLatLonAlt() != SLVec3d::ZERO || + AppCommon::devLoc.defaultLatLonAlt() != SLVec3d::ZERO) { ImGuiWindowFlags window_flags = 0; window_flags |= ImGuiWindowFlags_AlwaysAutoResize; + window_flags |= ImGuiWindowFlags_NoNavInputs; + ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); ImGui::Begin("Date and Time Settings", &showDateAndTime, window_flags); ImGui::PushItemWidth(ImGui::GetWindowWidth() * 0.66f); @@ -1125,27 +1167,27 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) { lt.tm_mon = month - 1; adjustedTime = mktime(<); - AppDemo::devLoc.calculateSolarAngles(AppDemo::devLoc.originLatLonAlt(), - adjustedTime); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + adjustedTime); } if (ImGui::SliderInt("Day", <.tm_mday, 1, 31)) { adjustedTime = mktime(<); - 
AppDemo::devLoc.calculateSolarAngles(AppDemo::devLoc.originLatLonAlt(), - adjustedTime); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + adjustedTime); } - SLfloat SRh = AppDemo::devLoc.originSolarSunrise(); - SLfloat SSh = AppDemo::devLoc.originSolarSunset(); + SLfloat SRh = AppCommon::devLoc.originSolarSunrise(); + SLfloat SSh = AppCommon::devLoc.originSolarSunset(); SLfloat nowF = (SLfloat)lt.tm_hour + (float)lt.tm_min / 60.0f; if (ImGui::SliderFloat("Hour", &nowF, SRh, SSh, "%.2f")) { lt.tm_hour = (int)nowF; - lt.tm_min = (int)((nowF - (int)nowF) * 60.0f); + lt.tm_min = (int)((nowF - floor(nowF)) * 60.0f); adjustedTime = mktime(<); - AppDemo::devLoc.calculateSolarAngles(AppDemo::devLoc.originLatLonAlt(), - adjustedTime); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + adjustedTime); } SLchar strTime[100]; @@ -1157,7 +1199,7 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) { adjustedTime = 0; memcpy(<, std::localtime(&now), sizeof(tm)); - AppDemo::devLoc.calculateSolarAngles(AppDemo::devLoc.originLatLonAlt(), now); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), now); } snprintf(strTime, sizeof(strTime), "Set highest noon (21.07.%02d 12:00)", lt.tm_year - 100); @@ -1169,8 +1211,8 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) lt.tm_min = 0; lt.tm_sec = 0; adjustedTime = mktime(<); - AppDemo::devLoc.calculateSolarAngles(AppDemo::devLoc.originLatLonAlt(), - adjustedTime); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + adjustedTime); } snprintf(strTime, sizeof(strTime), "Set lowest noon (21.12.%02d 12:00)", lt.tm_year - 100); @@ -1182,11 +1224,11 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) lt.tm_min = 0; lt.tm_sec = 0; adjustedTime = mktime(<); - AppDemo::devLoc.calculateSolarAngles(AppDemo::devLoc.originLatLonAlt(), - adjustedTime); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + 
adjustedTime); } - SLNode* sunLightNode = AppDemo::devLoc.sunLightNode(); + SLNode* sunLightNode = AppCommon::devLoc.sunLightNode(); if (sunLightNode && typeid(*sunLightNode) == typeid(SLLightDirect) && ((SLLightDirect*)sunLightNode)->doSunPowerAdaptation()) @@ -1215,14 +1257,14 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) if (showErlebAR) { ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - SLint namedLocIndex = AppDemo::devLoc.activeNamedLocation(); + SLint namedLocIndex = AppCommon::devLoc.activeNamedLocation(); SLVec3f lookAtPoint = SLVec3f::ZERO; - if (AppDemo::sceneID == SID_ErlebARBernChristoffel) + if (AppCommon::sceneID == SID_ErlebAR_BernChristoffel) { ImGui::Begin("Christoffel", &showErlebAR, - ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); // Get scene nodes once if (!bern) @@ -1257,15 +1299,15 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) ImGui::Separator(); #if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) - bool devLocIsUsed = AppDemo::devLoc.isUsed(); + bool devLocIsUsed = AppCommon::devLoc.isUsed(); if (ImGui::Checkbox("Use GPS Location", &devLocIsUsed)) - AppDemo::devLoc.isUsed(true); + AppCommon::devLoc.isUsed(true); #endif lookAtPoint.set(-21, 18, 6); - for (int i = 1; i < AppDemo::devLoc.nameLocations().size(); ++i) + for (int i = 1; i < AppCommon::devLoc.nameLocations().size(); ++i) { bool namedLocIsActive = namedLocIndex == i; - if (ImGui::Checkbox(AppDemo::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) + if (ImGui::Checkbox(AppCommon::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) setActiveNamedLocation(i, sv, lookAtPoint); } @@ -1279,102 +1321,100 @@ void AppDemoGui::build(SLScene* s, SLSceneView* sv) balda_stahl = nullptr; balda_glas = nullptr; } - if (AppDemo::sceneID == SID_ErlebARAugustaRauricaTmpTht || - AppDemo::sceneID == SID_ErlebARAugustaRauricaTht || - AppDemo::sceneID == 
SID_ErlebARAugustaRauricaTmp) + if (AppCommon::sceneID == SID_ErlebAR_AugustaRauricaTmpTht) { ImGui::Begin("Augst-Theatre-Temple", &showErlebAR, - ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); #if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) - bool devLocIsUsed = AppDemo::devLoc.isUsed(); + bool devLocIsUsed = AppCommon::devLoc.isUsed(); if (ImGui::Checkbox("Use GPS Location", &devLocIsUsed)) - AppDemo::devLoc.isUsed(true); + AppCommon::devLoc.isUsed(true); #endif - for (int i = 1; i < AppDemo::devLoc.nameLocations().size(); ++i) + for (int i = 1; i < AppCommon::devLoc.nameLocations().size(); ++i) { bool namedLocIsActive = namedLocIndex == i; - if (ImGui::Checkbox(AppDemo::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) + if (ImGui::Checkbox(AppCommon::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) setActiveNamedLocation(i, sv); } ImGui::End(); } - if (AppDemo::sceneID == SID_ErlebARAventicumAmphiteatre) + if (AppCommon::sceneID == SID_ErlebAR_AventicumAmphiteatre) { ImGui::Begin("Avenche-Amphitheatre", &showErlebAR, - ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); #if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) - bool devLocIsUsed = AppDemo::devLoc.isUsed(); + bool devLocIsUsed = AppCommon::devLoc.isUsed(); if (ImGui::Checkbox("Use GPS Location", &devLocIsUsed)) - AppDemo::devLoc.isUsed(true); + AppCommon::devLoc.isUsed(true); #endif - for (int i = 1; i < AppDemo::devLoc.nameLocations().size(); ++i) + for (int i = 1; i < AppCommon::devLoc.nameLocations().size(); ++i) { bool namedLocIsActive = namedLocIndex == i; - if (ImGui::Checkbox(AppDemo::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) + if (ImGui::Checkbox(AppCommon::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) 
setActiveNamedLocation(i, sv); } ImGui::End(); } - if (AppDemo::sceneID == SID_ErlebARAventicumCigognier) + if (AppCommon::sceneID == SID_ErlebAR_AventicumCigognier) { ImGui::Begin("Avenche-Cigognier", &showErlebAR, - ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); #if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) - bool devLocIsUsed = AppDemo::devLoc.isUsed(); + bool devLocIsUsed = AppCommon::devLoc.isUsed(); if (ImGui::Checkbox("Use GPS Location", &devLocIsUsed)) - AppDemo::devLoc.isUsed(true); + AppCommon::devLoc.isUsed(true); #endif - for (int i = 1; i < AppDemo::devLoc.nameLocations().size(); ++i) + for (int i = 1; i < AppCommon::devLoc.nameLocations().size(); ++i) { bool namedLocIsActive = namedLocIndex == i; - if (ImGui::Checkbox(AppDemo::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) + if (ImGui::Checkbox(AppCommon::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) setActiveNamedLocation(i, sv, lookAtPoint); } ImGui::End(); } - if (AppDemo::sceneID == SID_ErlebARAventicumTheatre) + if (AppCommon::sceneID == SID_ErlebAR_AventicumTheatre) { ImGui::Begin("Avenche-Theatre", &showErlebAR, - ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); #if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) - bool devLocIsUsed = AppDemo::devLoc.isUsed(); + bool devLocIsUsed = AppCommon::devLoc.isUsed(); if (ImGui::Checkbox("Use GPS Location", &devLocIsUsed)) - AppDemo::devLoc.isUsed(true); + AppCommon::devLoc.isUsed(true); #endif - for (int i = 1; i < AppDemo::devLoc.nameLocations().size(); ++i) + for (int i = 1; i < AppCommon::devLoc.nameLocations().size(); ++i) { bool namedLocIsActive = namedLocIndex == i; - if (ImGui::Checkbox(AppDemo::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) + if 
(ImGui::Checkbox(AppCommon::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) setActiveNamedLocation(i, sv); } ImGui::End(); } - if (AppDemo::sceneID == SID_ErlebARSutzKirchrain18) + if (AppCommon::sceneID == SID_ErlebAR_SutzKirchrain18) { ImGui::Begin("Sutz-Kirchrain18", &showErlebAR, - ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize); + ImGuiWindowFlags_NoResize | ImGuiWindowFlags_AlwaysAutoResize | ImGuiWindowFlags_NoNavInputs); #if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) - bool devLocIsUsed = AppDemo::devLoc.isUsed(); + bool devLocIsUsed = AppCommon::devLoc.isUsed(); if (ImGui::Checkbox("Use GPS Location", &devLocIsUsed)) - AppDemo::devLoc.isUsed(true); + AppCommon::devLoc.isUsed(true); #endif - for (int i = 1; i < AppDemo::devLoc.nameLocations().size(); ++i) + for (int i = 1; i < AppCommon::devLoc.nameLocations().size(); ++i) { bool namedLocIsActive = namedLocIndex == i; - if (ImGui::Checkbox(AppDemo::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) + if (ImGui::Checkbox(AppCommon::devLoc.nameLocations()[i].name.c_str(), &namedLocIsActive)) setActiveNamedLocation(i, sv); } @@ -1394,9 +1434,9 @@ CVCalibration guessCalibration(bool mirroredH, { #ifndef SL_EMSCRIPTEN // Try to read device lens and sensor information - string strF = AppDemo::deviceParameter["DeviceLensFocalLength"]; - string strW = AppDemo::deviceParameter["DeviceSensorPhysicalSizeW"]; - string strH = AppDemo::deviceParameter["DeviceSensorPhysicalSizeH"]; + string strF = AppCommon::deviceParameter["DeviceLensFocalLength"]; + string strW = AppCommon::deviceParameter["DeviceSensorPhysicalSizeW"]; + string strH = AppCommon::deviceParameter["DeviceSensorPhysicalSizeH"]; if (!strF.empty() && !strW.empty() && !strH.empty()) { float devF = strF.empty() ? 
0.0f : stof(strF); @@ -1441,14 +1481,14 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) { PROFILE_FUNCTION(); - assert(s->assetManager() && "No asset manager assigned to scene!"); - SLAssetManager* am = s->assetManager(); + // assert(s->assetManager() && "No asset manager assigned to scene!"); + SLAssetManager* am = AppCommon::assetManager; - SLSceneID sid = AppDemo::sceneID; + SLSceneID sid = AppCommon::sceneID; SLGLState* stateGL = SLGLState::instance(); CVCapture* capture = CVCapture::instance(); SLRenderType rType = sv->renderType(); - SLbool hasAnimations = (!s->animManager().allAnimNames().empty()); + SLbool hasAnimations = (!s->animManager().animationNames().empty()); static SLint curAnimIx = -1; if (!hasAnimations) curAnimIx = -1; @@ -1465,29 +1505,29 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("General")) { if (ImGui::MenuItem("Minimal Scene", nullptr, sid == SID_Minimal)) - s->onLoad(am, s, sv, SID_Minimal); + AppCommon::sceneToLoad = SID_Minimal; if (ImGui::MenuItem("Figure Scene", nullptr, sid == SID_Figure)) - s->onLoad(am, s, sv, SID_Figure); + AppCommon::sceneToLoad = SID_Figure; if (ImGui::MenuItem("Mesh Loader", nullptr, sid == SID_MeshLoad)) - s->onLoad(am, s, sv, SID_MeshLoad); + AppCommon::sceneToLoad = SID_MeshLoad; if (ImGui::MenuItem("Revolver Meshes", nullptr, sid == SID_Revolver)) - s->onLoad(am, s, sv, SID_Revolver); + AppCommon::sceneToLoad = SID_Revolver; if (ImGui::MenuItem("Texture Blending", nullptr, sid == SID_TextureBlend)) - s->onLoad(am, s, sv, SID_TextureBlend); + AppCommon::sceneToLoad = SID_TextureBlend; if (ImGui::MenuItem("Texture Filters", nullptr, sid == SID_TextureFilter)) - s->onLoad(am, s, sv, SID_TextureFilter); + AppCommon::sceneToLoad = SID_TextureFilter; #ifdef SL_BUILD_WITH_KTX if (ImGui::MenuItem("Texture Compression", nullptr, sid == SID_TextureCompression)) - s->onLoad(am, s, sv, SID_TextureCompression); + AppCommon::sceneToLoad = SID_TextureCompression; #endif 
if (ImGui::MenuItem("Frustum Culling", nullptr, sid == SID_FrustumCull)) - s->onLoad(am, s, sv, SID_FrustumCull); + AppCommon::sceneToLoad = SID_FrustumCull; if (ImGui::MenuItem("2D and 3D Text", nullptr, sid == SID_2Dand3DText)) - s->onLoad(am, s, sv, SID_2Dand3DText); + AppCommon::sceneToLoad = SID_2Dand3DText; if (ImGui::MenuItem("Point Clouds", nullptr, sid == SID_PointClouds)) - s->onLoad(am, s, sv, SID_PointClouds); + AppCommon::sceneToLoad = SID_PointClouds; if (ImGui::MenuItem("Z-Fighting", nullptr, sid == SID_ZFighting)) - s->onLoad(am, s, sv, SID_ZFighting); + AppCommon::sceneToLoad = SID_ZFighting; ImGui::EndMenu(); } @@ -1495,42 +1535,42 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Shader")) { if (ImGui::MenuItem("Per Vertex Blinn-Phong", nullptr, sid == SID_ShaderPerVertexBlinn)) - s->onLoad(am, s, sv, SID_ShaderPerVertexBlinn); + AppCommon::sceneToLoad = SID_ShaderPerVertexBlinn; if (ImGui::MenuItem("Per Pixel Blinn-Phong", nullptr, sid == SID_ShaderPerPixelBlinn)) - s->onLoad(am, s, sv, SID_ShaderPerPixelBlinn); + AppCommon::sceneToLoad = SID_ShaderPerPixelBlinn; if (ImGui::MenuItem("Per Pixel Cook-Torrance", nullptr, sid == SID_ShaderPerPixelCook)) - s->onLoad(am, s, sv, SID_ShaderPerPixelCook); + AppCommon::sceneToLoad = SID_ShaderPerPixelCook; if (ImGui::MenuItem("Image Based Lighting", nullptr, sid == SID_ShaderIBL)) - s->onLoad(am, s, sv, SID_ShaderIBL); - if (ImGui::MenuItem("Per Vertex Wave", nullptr, sid == SID_ShaderPerVertexWave)) - s->onLoad(am, s, sv, SID_ShaderPerVertexWave); + AppCommon::sceneToLoad = SID_ShaderIBL; + if (ImGui::MenuItem("Per Vertex Wave", nullptr, sid == SID_ShaderWave)) + AppCommon::sceneToLoad = SID_ShaderWave; if (ImGui::MenuItem("Bump Mapping", nullptr, sid == SID_ShaderBumpNormal)) - s->onLoad(am, s, sv, SID_ShaderBumpNormal); + AppCommon::sceneToLoad = SID_ShaderBumpNormal; if (ImGui::MenuItem("Parallax Mapping", nullptr, sid == SID_ShaderBumpParallax)) - s->onLoad(am, s, 
sv, SID_ShaderBumpParallax); - if (ImGui::MenuItem("Skybox Shader", nullptr, sid == SID_ShaderSkyBox)) - s->onLoad(am, s, sv, SID_ShaderSkyBox); + AppCommon::sceneToLoad = SID_ShaderBumpParallax; + if (ImGui::MenuItem("Skybox Shader", nullptr, sid == SID_ShaderSkybox)) + AppCommon::sceneToLoad = SID_ShaderSkybox; if (ImGui::MenuItem("Earth Shader", nullptr, sid == SID_ShaderEarth)) - s->onLoad(am, s, sv, SID_ShaderEarth); + AppCommon::sceneToLoad = SID_ShaderEarth; ImGui::EndMenu(); } if (ImGui::BeginMenu("Shadow Mapping")) { if (ImGui::MenuItem("Basic Scene", nullptr, sid == SID_ShadowMappingBasicScene)) - s->onLoad(am, s, sv, SID_ShadowMappingBasicScene); + AppCommon::sceneToLoad = SID_ShadowMappingBasicScene; if (ImGui::MenuItem("Light Types", nullptr, sid == SID_ShadowMappingLightTypes)) - s->onLoad(am, s, sv, SID_ShadowMappingLightTypes); + AppCommon::sceneToLoad = SID_ShadowMappingLightTypes; if (ImGui::MenuItem("8 Spot Lights", nullptr, sid == SID_ShadowMappingSpotLights)) - s->onLoad(am, s, sv, SID_ShadowMappingSpotLights); + AppCommon::sceneToLoad = SID_ShadowMappingSpotLights; if (ImGui::MenuItem("3 Point Lights", nullptr, sid == SID_ShadowMappingPointLights)) - s->onLoad(am, s, sv, SID_ShadowMappingPointLights); + AppCommon::sceneToLoad = SID_ShadowMappingPointLights; if (ImGui::MenuItem("RT Soft Shadows", nullptr, sid == SID_RTSoftShadows)) - s->onLoad(am, s, sv, SID_RTSoftShadows); + AppCommon::sceneToLoad = SID_RTSoftShadows; if (ImGui::MenuItem("Cascaded Shadows", nullptr, sid == SID_ShadowMappingCascaded)) - s->onLoad(am, s, sv, SID_ShadowMappingCascaded); - if (ImGui::MenuItem("Columns with Cascaded Sh.", nullptr, sid == SID_Benchmark6_ColumnsLOD)) - s->onLoad(am, s, sv, SID_Benchmark6_ColumnsLOD); + AppCommon::sceneToLoad = SID_ShadowMappingCascaded; + if (ImGui::MenuItem("Columns with Cascaded Sh.", nullptr, sid == SID_Benchmark_ColumnsLOD)) + AppCommon::sceneToLoad = SID_Benchmark_ColumnsLOD; ImGui::EndMenu(); } @@ -1538,33 +1578,35 @@ void 
AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Suzanne Lighting")) { if (ImGui::MenuItem("w. per Pixel Lighting (PL)", nullptr, sid == SID_SuzannePerPixBlinn)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinn); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinn; if (ImGui::MenuItem("w. PL and Texture Mapping (TM)", nullptr, sid == SID_SuzannePerPixBlinnTm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTm; if (ImGui::MenuItem("w. PL and Normal Mapping (NM)", nullptr, sid == SID_SuzannePerPixBlinnNm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnNm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnNm; if (ImGui::MenuItem("w. PL and Ambient Occlusion (AO)", nullptr, sid == SID_SuzannePerPixBlinnAo)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnAo); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnAo; if (ImGui::MenuItem("w. PL and Shadow Mapping (SM)", nullptr, sid == SID_SuzannePerPixBlinnSm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnSm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnSm; if (ImGui::MenuItem("w. PL, TM, NM", nullptr, sid == SID_SuzannePerPixBlinnTmNm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTmNm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTmNm; if (ImGui::MenuItem("w. PL, TM, AO", nullptr, sid == SID_SuzannePerPixBlinnTmAo)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTmAo); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTmAo; if (ImGui::MenuItem("w. PL, NM, AO", nullptr, sid == SID_SuzannePerPixBlinnNmAo)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnNmAo); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnNmAo; if (ImGui::MenuItem("w. PL, NM, SM", nullptr, sid == SID_SuzannePerPixBlinnNmSm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnNmSm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnNmSm; if (ImGui::MenuItem("w. 
PL, TM, SM", nullptr, sid == SID_SuzannePerPixBlinnTmSm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTmSm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTmSm; if (ImGui::MenuItem("w. PL, AO, SM", nullptr, sid == SID_SuzannePerPixBlinnAoSm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnAoSm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnAoSm; if (ImGui::MenuItem("w. PL, TM, NM, AO", nullptr, sid == SID_SuzannePerPixBlinnTmNmAo)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTmNmAo); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTmNmAo; if (ImGui::MenuItem("w. PL, TM, NM, SM", nullptr, sid == SID_SuzannePerPixBlinnTmNmSm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTmNmSm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTmNmSm; if (ImGui::MenuItem("w. PL, TM, NM, AO, SM", nullptr, sid == SID_SuzannePerPixBlinnTmNmAoSm)) - s->onLoad(am, s, sv, SID_SuzannePerPixBlinnTmNmAoSm); + AppCommon::sceneToLoad = SID_SuzannePerPixBlinnTmNmAoSm; + if (ImGui::MenuItem("w. PL, TM, NM, AO, SM, EM", nullptr, sid == SID_SuzannePerPixCookTmNmAoSmEm)) + AppCommon::sceneToLoad = SID_SuzannePerPixCookTmNmAoSmEm; ImGui::EndMenu(); } @@ -1573,13 +1615,13 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) SLstring zip = "glTF-Sample-Models.zip"; if (ImGui::MenuItem("Damaged Helmet", nullptr, sid == SID_glTF_DamagedHelmet)) - s->onLoad(am, s, sv, SID_glTF_DamagedHelmet); + AppCommon::sceneToLoad = SID_glTF_DamagedHelmet; if (ImGui::MenuItem("Flight Helmet", nullptr, sid == SID_glTF_FlightHelmet)) - s->onLoad(am, s, sv, SID_glTF_FlightHelmet); + AppCommon::sceneToLoad = SID_glTF_FlightHelmet; if (ImGui::MenuItem("Sponza Palace", nullptr, sid == SID_glTF_Sponza)) - s->onLoad(am, s, sv, SID_glTF_Sponza); + AppCommon::sceneToLoad = SID_glTF_Sponza; if (ImGui::MenuItem("Water Bottle", nullptr, sid == SID_glTF_WaterBottle)) - s->onLoad(am, s, sv, SID_glTF_WaterBottle); + AppCommon::sceneToLoad = SID_glTF_WaterBottle; ImGui::EndMenu(); } @@ -1589,7 +1631,7 @@ void 
AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) SLstring zip = "GLTF-FanucCRX.zip"; if (ImGui::MenuItem("Fanuc-CRX", nullptr, sid == SID_Robotics_FanucCRX_FK)) - s->onLoad(am, s, sv, SID_Robotics_FanucCRX_FK); + AppCommon::sceneToLoad = SID_Robotics_FanucCRX_FK; ImGui::EndMenu(); } @@ -1597,18 +1639,19 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Volume Rendering")) { if (ImGui::MenuItem("Head MRI Ray Cast", nullptr, sid == SID_VolumeRayCast)) - s->onLoad(am, s, sv, SID_VolumeRayCast); + AppCommon::sceneToLoad = SID_VolumeRayCast; if (ImGui::MenuItem("Head MRI Ray Cast Lighted", nullptr, sid == SID_VolumeRayCastLighted)) + AppCommon::sceneToLoad = SID_VolumeRayCastLighted; + /* { - auto loadMRIImages = []() - { - AppDemo::jobProgressMsg("Load MRI Images"); - AppDemo::jobProgressMax(100); + auto loadMRIImages = []() { + AppCommon::jobProgressMsg("Load MRI Images"); + AppCommon::jobProgressMax(100); // Load volume data into 3D texture SLVstring mriImages; for (SLint i = 0; i < 207; ++i) - mriImages.push_back(AppDemo::texturePath + Utils::formatString("i%04u_0000b.png", i)); + mriImages.push_back(AppCommon::texturePath + Utils::formatString("i%04u_0000b.png", i)); gTexMRI3D = new SLGLTexture(nullptr, mriImages, @@ -1623,56 +1666,51 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) #endif "mri_head_front_to_back", true); - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; - auto calculateGradients = []() - { - AppDemo::jobProgressMsg("Calculate MRI Volume Gradients"); - AppDemo::jobProgressMax(100); + auto calculateGradients = []() { + AppCommon::jobProgressMsg("Calculate MRI Volume Gradients"); + AppCommon::jobProgressMax(100); gTexMRI3D->calc3DGradients(1, - [](int progress) - { AppDemo::jobProgressNum(progress); }); - AppDemo::jobIsRunning = false; + [](int progress) { AppCommon::jobProgressNum(progress); }); + AppCommon::jobIsRunning = false; }; - auto smoothGradients = []() - { - 
AppDemo::jobProgressMsg("Smooth MRI Volume Gradients"); - AppDemo::jobProgressMax(100); + auto smoothGradients = []() { + AppCommon::jobProgressMsg("Smooth MRI Volume Gradients"); + AppCommon::jobProgressMax(100); gTexMRI3D->smooth3DGradients(1, - [](int progress) - { AppDemo::jobProgressNum(progress); }); - AppDemo::jobIsRunning = false; + [](int progress) { AppCommon::jobProgressNum(progress); }); + AppCommon::jobIsRunning = false; }; - auto followUpJob1 = [](SLAssetManager* am, SLScene* s, SLSceneView* sv) - { - s->onLoad(am, s, sv, SID_VolumeRayCastLighted); + auto followUpJob1 = [](SLAssetManager* am, SLScene* s, SLSceneView* sv) { + AppCommon::sceneToLoad = SID_VolumeRayCastLighted; }; function onLoadScene = bind(followUpJob1, am, s, sv); - AppDemo::jobsToBeThreaded.emplace_back(loadMRIImages); - AppDemo::jobsToBeThreaded.emplace_back(calculateGradients); - // AppDemo::jobsToBeThreaded.emplace_back(smoothGradients); // very slow - AppDemo::jobsToFollowInMain.push_back(onLoadScene); + AppCommon::jobsToBeThreaded.emplace_back(loadMRIImages); + AppCommon::jobsToBeThreaded.emplace_back(calculateGradients); + // AppCommon::jobsToBeThreaded.emplace_back(smoothGradients); // very slow + AppCommon::jobsToFollowInMain.push_back(onLoadScene); } - + */ ImGui::EndMenu(); } if (ImGui::BeginMenu("Animation")) { if (ImGui::MenuItem("Node Animation", nullptr, sid == SID_AnimationNode)) - s->onLoad(am, s, sv, SID_AnimationNode); - if (ImGui::MenuItem("Mass Animation", nullptr, sid == SID_AnimationMass)) - s->onLoad(am, s, sv, SID_AnimationMass); - if (ImGui::MenuItem("Skeletal Animation", nullptr, sid == SID_AnimationSkinned)) - s->onLoad(am, s, sv, SID_AnimationSkinned); - if (ImGui::MenuItem("AstroBoy Army", nullptr, sid == SID_AnimationAstroboyArmy)) - s->onLoad(am, s, sv, SID_AnimationAstroboyArmy); + AppCommon::sceneToLoad = SID_AnimationNode; + if (ImGui::MenuItem("Mass Node Animation", nullptr, sid == SID_AnimationNodeMass)) + AppCommon::sceneToLoad = 
SID_AnimationNodeMass; + if (ImGui::MenuItem("Skinned Animation", nullptr, sid == SID_AnimationSkinned)) + AppCommon::sceneToLoad = SID_AnimationSkinned; + if (ImGui::MenuItem("Mass Skinned Animation", nullptr, sid == SID_AnimationSkinnedMass)) + AppCommon::sceneToLoad = SID_AnimationSkinnedMass; if (ImGui::MenuItem("Fanuc-CRX", nullptr, sid == SID_Robotics_FanucCRX_FK)) - s->onLoad(am, s, sv, SID_Robotics_FanucCRX_FK); + AppCommon::sceneToLoad = SID_Robotics_FanucCRX_FK; ImGui::EndMenu(); } @@ -1680,36 +1718,36 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Video")) { if (ImGui::MenuItem("Texture from Video Live", nullptr, sid == SID_VideoTextureLive)) - s->onLoad(am, s, sv, SID_VideoTextureLive); + AppCommon::sceneToLoad = SID_VideoTextureLive; #ifndef SL_EMSCRIPTEN if (ImGui::MenuItem("Texture from Video File", nullptr, sid == SID_VideoTextureFile)) - s->onLoad(am, s, sv, SID_VideoTextureFile); + AppCommon::sceneToLoad = SID_VideoTextureFile; #endif if (ImGui::MenuItem("Track ArUco Marker (Main)", nullptr, sid == SID_VideoTrackArucoMain)) - s->onLoad(am, s, sv, SID_VideoTrackArucoMain); + AppCommon::sceneToLoad = SID_VideoTrackArucoMain; if (ImGui::MenuItem("Track ArUco Marker (Scnd)", nullptr, sid == SID_VideoTrackArucoScnd, capture->hasSecondaryCamera)) - s->onLoad(am, s, sv, SID_VideoTrackArucoScnd); + AppCommon::sceneToLoad = SID_VideoTrackArucoScnd; if (ImGui::MenuItem("Track Chessboard (Main)", nullptr, sid == SID_VideoTrackChessMain)) - s->onLoad(am, s, sv, SID_VideoTrackChessMain); + AppCommon::sceneToLoad = SID_VideoTrackChessMain; if (ImGui::MenuItem("Track Chessboard (Scnd)", nullptr, sid == SID_VideoTrackChessScnd, capture->hasSecondaryCamera)) - s->onLoad(am, s, sv, SID_VideoTrackChessScnd); + AppCommon::sceneToLoad = SID_VideoTrackChessScnd; if (ImGui::MenuItem("Track Features (Main)", nullptr, sid == SID_VideoTrackFeature2DMain)) - s->onLoad(am, s, sv, SID_VideoTrackFeature2DMain); + AppCommon::sceneToLoad = 
SID_VideoTrackFeature2DMain; #ifndef SL_EMSCRIPTEN if (ImGui::MenuItem("Track Face (Main)", nullptr, sid == SID_VideoTrackFaceMain)) - s->onLoad(am, s, sv, SID_VideoTrackFaceMain); + AppCommon::sceneToLoad = SID_VideoTrackFaceMain; if (ImGui::MenuItem("Track Face (Scnd)", nullptr, sid == SID_VideoTrackFaceScnd, capture->hasSecondaryCamera)) - s->onLoad(am, s, sv, SID_VideoTrackFaceScnd); + AppCommon::sceneToLoad = SID_VideoTrackFaceScnd; #endif #ifdef SL_BUILD_WITH_MEDIAPIPE if (ImGui::MenuItem("Track Hands w. Mediapipe (Main)", nullptr, sid == SID_VideoTrackMediaPipeHandsMain)) - s->onLoad(am, s, sv, SID_VideoTrackMediaPipeHandsMain); + AppCommon::sceneToLoad = SID_VideoTrackMediaPipeHandsMain; #endif if (ImGui::MenuItem("Sensor AR (Main)", nullptr, sid == SID_VideoSensorAR)) - s->onLoad(am, s, sv, SID_VideoSensorAR); + AppCommon::sceneToLoad = SID_VideoSensorAR; #ifdef SL_BUILD_WAI if (ImGui::MenuItem("Track WAI (Main)", nullptr, sid == SID_VideoTrackWAI)) - s->onLoad(am, s, sv, SID_VideoTrackWAI); + AppCommon::sceneToLoad = SID_VideoTrackWAI; #endif ImGui::EndMenu(); } @@ -1717,17 +1755,15 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Ray Tracing")) { if (ImGui::MenuItem("Spheres", nullptr, sid == SID_RTSpheres)) - s->onLoad(am, s, sv, SID_RTSpheres); + AppCommon::sceneToLoad = SID_RTSpheres; if (ImGui::MenuItem("Muttenzer Box", nullptr, sid == SID_RTMuttenzerBox)) - s->onLoad(am, s, sv, SID_RTMuttenzerBox); + AppCommon::sceneToLoad = SID_RTMuttenzerBox; if (ImGui::MenuItem("Soft Shadows", nullptr, sid == SID_RTSoftShadows)) - s->onLoad(am, s, sv, SID_RTSoftShadows); + AppCommon::sceneToLoad = SID_RTSoftShadows; if (ImGui::MenuItem("Depth of Field", nullptr, sid == SID_RTDoF)) - s->onLoad(am, s, sv, SID_RTDoF); + AppCommon::sceneToLoad = SID_RTDoF; if (ImGui::MenuItem("Lens Test", nullptr, sid == SID_RTLens)) - s->onLoad(am, s, sv, SID_RTLens); - if (ImGui::MenuItem("RT Test", nullptr, sid == SID_RTTest)) - s->onLoad(am, s, 
sv, SID_RTTest); + AppCommon::sceneToLoad = SID_RTLens; ImGui::EndMenu(); } @@ -1735,32 +1771,35 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Path Tracing")) { if (ImGui::MenuItem("Muttenzer Box", nullptr, sid == SID_RTMuttenzerBox)) - s->onLoad(am, s, sv, SID_RTMuttenzerBox); + AppCommon::sceneToLoad = SID_RTMuttenzerBox; ImGui::EndMenu(); } if (ImGui::BeginMenu("Particle Systems")) { - if (ImGui::MenuItem("First Particle System", nullptr, sid == SID_ParticleSystem_First)) - s->onLoad(am, s, sv, SID_ParticleSystem_First); - if (ImGui::MenuItem("Demo Particle System", nullptr, sid == SID_ParticleSystem_Demo)) - s->onLoad(am, s, sv, SID_ParticleSystem_Demo); + if (ImGui::MenuItem("First Particle System", nullptr, sid == SID_ParticleSystem_Simple)) + AppCommon::sceneToLoad = SID_ParticleSystem_Simple; if (ImGui::MenuItem("Dust Storm Particle System", nullptr, sid == SID_ParticleSystem_DustStorm)) - s->onLoad(am, s, sv, SID_ParticleSystem_DustStorm); + AppCommon::sceneToLoad = SID_ParticleSystem_DustStorm; if (ImGui::MenuItem("Fountain Particle System", nullptr, sid == SID_ParticleSystem_Fountain)) - s->onLoad(am, s, sv, SID_ParticleSystem_Fountain); + AppCommon::sceneToLoad = SID_ParticleSystem_Fountain; if (ImGui::MenuItem("Sun Particle System", nullptr, sid == SID_ParticleSystem_Sun)) - s->onLoad(am, s, sv, SID_ParticleSystem_Sun); + AppCommon::sceneToLoad = SID_ParticleSystem_Sun; if (ImGui::MenuItem("Ring of Fire Particle System", nullptr, sid == SID_ParticleSystem_RingOfFire)) - s->onLoad(am, s, sv, SID_ParticleSystem_RingOfFire); - if (ImGui::MenuItem("Complex Fire Particle System", nullptr, sid == SID_ParticleSystem_FireComplex)) - s->onLoad(am, s, sv, SID_ParticleSystem_FireComplex); + AppCommon::sceneToLoad = SID_ParticleSystem_RingOfFire; + if (ImGui::MenuItem("Complex Fire Particle System", nullptr, sid == SID_ParticleSystem_ComplexFire)) + AppCommon::sceneToLoad = SID_ParticleSystem_ComplexFire; + if 
(ImGui::MenuItem("Particle system w. 1 mio. particles", nullptr, sid == SID_ParticleSystem_Many)) + AppCommon::sceneToLoad = SID_ParticleSystem_Many; ImGui::EndMenu(); } - SLstring erlebarPath = AppDemo::dataPath + "erleb-AR/models/"; + // Download content from pallas/home/private/projects/2020.Erleb-AR/erleb-AR-data/productive/models_for_SLProject + // and copy it into AppCommon::dataPath + "erleb-AR/models/ + // This data is copyright protected and can only be accessed with user and password + SLstring erlebarPath = AppCommon::dataPath + "erleb-AR/models/"; SLstring modelBR2 = erlebarPath + "bern/bern-christoffel.gltf"; SLstring modelBFH = erlebarPath + "biel/Biel-BFH-Rolex.gltf"; SLstring modelAR1 = erlebarPath + "augst/augst-thtL1-tmpL2.gltf"; @@ -1781,40 +1820,32 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Erleb-AR")) { if (Utils::fileExists(modelBR2)) - if (ImGui::MenuItem("Bern: Christoffel Tower", nullptr, sid == SID_ErlebARBernChristoffel)) - s->onLoad(am, s, sv, SID_ErlebARBernChristoffel); + if (ImGui::MenuItem("Bern: Christoffel Tower", nullptr, sid == SID_ErlebAR_BernChristoffel)) + AppCommon::sceneToLoad = SID_ErlebAR_BernChristoffel; if (Utils::fileExists(modelBFH)) - if (ImGui::MenuItem("Biel: BFH", nullptr, sid == SID_ErlebARBielBFH)) - s->onLoad(am, s, sv, SID_ErlebARBielBFH); - - if (Utils::fileExists(modelAR1)) - if (ImGui::MenuItem("Augusta Raurica Temple", nullptr, sid == SID_ErlebARAugustaRauricaTmp)) - s->onLoad(am, s, sv, SID_ErlebARAugustaRauricaTmp); - - if (Utils::fileExists(modelAR2)) - if (ImGui::MenuItem("Augusta Raurica Theater", nullptr, sid == SID_ErlebARAugustaRauricaTht)) - s->onLoad(am, s, sv, SID_ErlebARAugustaRauricaTht); + if (ImGui::MenuItem("Biel: BFH", nullptr, sid == SID_ErlebAR_BielBFH)) + AppCommon::sceneToLoad = SID_ErlebAR_BielBFH; if (Utils::fileExists(modelAR3)) - if (ImGui::MenuItem("Augusta Raurica Temple & Theater", nullptr, sid == SID_ErlebARAugustaRauricaTmpTht)) - 
s->onLoad(am, s, sv, SID_ErlebARAugustaRauricaTmpTht); + if (ImGui::MenuItem("Augusta Raurica Temple & Theater", nullptr, sid == SID_ErlebAR_AugustaRauricaTmpTht)) + AppCommon::sceneToLoad = SID_ErlebAR_AugustaRauricaTmpTht; if (Utils::fileExists(modelAV1_AO)) - if (ImGui::MenuItem("Aventicum: Amphitheatre", nullptr, sid == SID_ErlebARAventicumAmphiteatre)) - s->onLoad(am, s, sv, SID_ErlebARAventicumAmphiteatre); + if (ImGui::MenuItem("Aventicum: Amphitheatre", nullptr, sid == SID_ErlebAR_AventicumAmphiteatre)) + AppCommon::sceneToLoad = SID_ErlebAR_AventicumAmphiteatre; if (Utils::fileExists(modelAV2_AO)) - if (ImGui::MenuItem("Aventicum: Cigognier", nullptr, sid == SID_ErlebARAventicumCigognier)) - s->onLoad(am, s, sv, SID_ErlebARAventicumCigognier); + if (ImGui::MenuItem("Aventicum: Cigognier", nullptr, sid == SID_ErlebAR_AventicumCigognier)) + AppCommon::sceneToLoad = SID_ErlebAR_AventicumCigognier; if (Utils::fileExists(modelAV3)) - if (ImGui::MenuItem("Aventicum: Theatre", nullptr, sid == SID_ErlebARAventicumTheatre)) - s->onLoad(am, s, sv, SID_ErlebARAventicumTheatre); + if (ImGui::MenuItem("Aventicum: Theatre", nullptr, sid == SID_ErlebAR_AventicumTheatre)) + AppCommon::sceneToLoad = SID_ErlebAR_AventicumTheatre; if (Utils::fileExists(modelSU1)) - if (ImGui::MenuItem("Sutz: Kirchrain 18", nullptr, sid == SID_ErlebARSutzKirchrain18)) - s->onLoad(am, s, sv, SID_ErlebARSutzKirchrain18); + if (ImGui::MenuItem("Sutz: Kirchrain 18", nullptr, sid == SID_ErlebAR_SutzKirchrain18)) + AppCommon::sceneToLoad = SID_ErlebAR_SutzKirchrain18; ImGui::EndMenu(); } @@ -1823,18 +1854,17 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Benchmarks")) { #ifndef SL_EMSCRIPTEN - // The large models are too large for emscripten - if (ImGui::MenuItem("Large Model (via FTP)", nullptr, sid == SID_Benchmark1_LargeModel)) + /* The large models are too large for emscripten + if (ImGui::MenuItem("Large Model (via FTP)", nullptr, sid == 
SID_Benchmark_LargeModel)) { - SLstring largeFile = AppDemo::configPath + "models/xyzrgb_dragon/xyzrgb_dragon.ply"; + SLstring largeFile = AppCommon::configPath + "models/xyzrgb_dragon/xyzrgb_dragon.ply"; if (Utils::fileExists(largeFile)) - s->onLoad(am, s, sv, SID_Benchmark1_LargeModel); + AppCommon::sceneToLoad = SID_Benchmark_LargeModel; else { - auto downloadJobFTP = []() - { - AppDemo::jobProgressMsg("Downloading large dragon file via FTP:"); - AppDemo::jobProgressMax(100); + auto downloadJobFTP = []() { + AppCommon::jobProgressMsg("Downloading large dragon file via FTP:"); + AppCommon::jobProgressMax(100); ftplib ftp; ftp.SetConnmode(ftplib::connmode::port); // enable active mode @@ -1851,10 +1881,10 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) &remoteSize, ftplib::transfermode::image); ftpXferSizeMax = remoteSize; - SLstring dstDir = AppDemo::configPath; + SLstring dstDir = AppCommon::configPath; if (Utils::dirExists(dstDir)) { - SLstring outFile = AppDemo::configPath + "models/xyzrgb_dragon.zip"; + SLstring outFile = AppCommon::configPath + "models/xyzrgb_dragon.zip"; if (!ftp.Get(outFile.c_str(), "xyzrgb_dragon.zip", ftplib::transfermode::image)) @@ -1873,59 +1903,55 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) SL_LOG("*** ERROR: ftp.Connect failed. 
***"); ftp.Quit(); - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; - auto unzipJob = [largeFile]() - { - AppDemo::jobProgressMsg("Decompress dragon file:"); - AppDemo::jobProgressMax(-1); - string zipFile = AppDemo::configPath + "models/xyzrgb_dragon.zip"; + auto unzipJob = [largeFile]() { + AppCommon::jobProgressMsg("Decompress dragon file:"); + AppCommon::jobProgressMax(-1); + string zipFile = AppCommon::configPath + "models/xyzrgb_dragon.zip"; if (Utils::fileExists(zipFile)) { ZipUtils::unzip(zipFile, Utils::getPath(zipFile)); Utils::deleteFile(zipFile); } - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; - auto followUpJob1 = [am, s, sv, largeFile]() - { + auto followUpJob1 = [am, s, sv, largeFile]() { if (Utils::fileExists(largeFile)) - s->onLoad(am, s, sv, SID_Benchmark1_LargeModel); + AppCommon::sceneToLoad = SID_Benchmark_LargeModel; }; - AppDemo::jobsToBeThreaded.emplace_back(downloadJobFTP); - AppDemo::jobsToBeThreaded.emplace_back(unzipJob); - AppDemo::jobsToFollowInMain.emplace_back(followUpJob1); + AppCommon::jobsToBeThreaded.emplace_back(downloadJobFTP); + AppCommon::jobsToBeThreaded.emplace_back(unzipJob); + AppCommon::jobsToFollowInMain.emplace_back(followUpJob1); } } - if (ImGui::MenuItem("Large Model (via HTTPS)", nullptr, sid == SID_Benchmark1_LargeModel)) + if (ImGui::MenuItem("Large Model (via HTTPS)", nullptr, sid == SID_Benchmark_LargeModel)) { - SLstring largeFile = AppDemo::configPath + "models/xyzrgb_dragon/xyzrgb_dragon.ply"; - loadSceneWithLargeModel(s, sv, "xyzrgb_dragon.zip", largeFile, SID_Benchmark1_LargeModel); - } - if (ImGui::MenuItem("Large Model", nullptr, sid == SID_Benchmark1_LargeModel)) - s->onLoad(am, s, sv, SID_Benchmark1_LargeModel); + SLstring largeFile = AppCommon::configPath + "models/xyzrgb_dragon/xyzrgb_dragon.ply"; + loadSceneWithLargeModel(s, sv, "xyzrgb_dragon.zip", largeFile, SID_Benchmark_LargeModel); + }*/ #endif - if (ImGui::MenuItem("Massive Nodes", nullptr, sid == 
SID_Benchmark2_MassiveNodes)) - s->onLoad(am, s, sv, SID_Benchmark2_MassiveNodes); - if (ImGui::MenuItem("Massive Node Animations", nullptr, sid == SID_Benchmark3_NodeAnimations)) - s->onLoad(am, s, sv, SID_Benchmark3_NodeAnimations); - if (ImGui::MenuItem("Jan's Universe", nullptr, sid == SID_Benchmark7_JansUniverse)) - s->onLoad(am, s, sv, SID_Benchmark7_JansUniverse); - if (ImGui::MenuItem("Massive Skinned Animations", nullptr, sid == SID_Benchmark4_SkinnedAnimations)) - s->onLoad(am, s, sv, SID_Benchmark4_SkinnedAnimations); - if (ImGui::MenuItem("Columns without LOD", nullptr, sid == SID_Benchmark5_ColumnsNoLOD)) - s->onLoad(am, s, sv, SID_Benchmark5_ColumnsNoLOD); - if (ImGui::MenuItem("Columns with LOD", nullptr, sid == SID_Benchmark6_ColumnsLOD)) - s->onLoad(am, s, sv, SID_Benchmark6_ColumnsLOD); - if (ImGui::MenuItem("Jan's Universe", nullptr, sid == SID_Benchmark7_JansUniverse)) - s->onLoad(am, s, sv, SID_Benchmark7_JansUniverse); - if (ImGui::MenuItem("Particle System lot of fire complex", nullptr, sid == SID_Benchmark8_ParticleSystemFireComplex)) - s->onLoad(am, s, sv, SID_Benchmark8_ParticleSystemFireComplex); - if (ImGui::MenuItem("Particle System lot of particle", nullptr, sid == SID_Benchmark9_ParticleSystemManyParticles)) - s->onLoad(am, s, sv, SID_Benchmark9_ParticleSystemManyParticles); + if (ImGui::MenuItem("Large Model", nullptr, sid == SID_Benchmark_LargeModel)) + AppCommon::sceneToLoad = SID_Benchmark_LargeModel; + if (ImGui::MenuItem("Massive Nodes", nullptr, sid == SID_Benchmark_LotsOfNodes)) + AppCommon::sceneToLoad = SID_Benchmark_LotsOfNodes; + if (ImGui::MenuItem("Massive Node Animations", nullptr, sid == SID_Benchmark_NodeAnimations)) + AppCommon::sceneToLoad = SID_Benchmark_NodeAnimations; + if (ImGui::MenuItem("Jan's Universe", nullptr, sid == SID_Benchmark_JansUniverse)) + AppCommon::sceneToLoad = SID_Benchmark_JansUniverse; + if (ImGui::MenuItem("Massive Skinned Animations", nullptr, sid == SID_Benchmark_SkinnedAnimations)) + 
AppCommon::sceneToLoad = SID_Benchmark_SkinnedAnimations; + if (ImGui::MenuItem("Columns without LOD", nullptr, sid == SID_Benchmark_ColumnsNoLOD)) + AppCommon::sceneToLoad = SID_Benchmark_ColumnsNoLOD; + if (ImGui::MenuItem("Columns with LOD", nullptr, sid == SID_Benchmark_ColumnsLOD)) + AppCommon::sceneToLoad = SID_Benchmark_ColumnsLOD; + if (ImGui::MenuItem("Particle System lot of fire complex", nullptr, sid == SID_Benchmark_ParticleSystemComplexFire)) + AppCommon::sceneToLoad = SID_Benchmark_ParticleSystemComplexFire; + if (ImGui::MenuItem("Particle System w. 1 mio. particle", nullptr, sid == SID_ParticleSystem_Many)) + AppCommon::sceneToLoad = SID_ParticleSystem_Many; ImGui::EndMenu(); } @@ -1933,19 +1959,19 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) } if (ImGui::MenuItem("Empty Scene", "Shift-Alt-0", sid == SID_Empty)) - s->onLoad(am, s, sv, SID_Empty); + AppCommon::sceneToLoad = SID_Empty; if (ImGui::MenuItem("Next Scene", - "Shift-Alt-Right", + "Shift-Alt->", nullptr, - AppDemo::sceneID < SID_MaxPublicAssets - 1)) - s->onLoad(am, s, sv, AppDemo::sceneID + 1); + AppCommon::sceneID < SID_MaxPublicAssets - 1)) + AppCommon::sceneToLoad = static_cast(AppCommon::sceneID + 1); if (ImGui::MenuItem("Previous Scene", - "Shift-Alt-Left", + "Shift-Alt-<", nullptr, - AppDemo::sceneID > SID_Empty)) - s->onLoad(am, s, sv, AppDemo::sceneID - 1); + AppCommon::sceneID > SID_Empty)) + AppCommon::sceneToLoad = static_cast(AppCommon::sceneID - 1); #ifndef SL_EMSCRIPTEN ImGui::Separator(); @@ -1958,15 +1984,15 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) PROFILE_SCOPE("Parallel Job 1"); uint maxIter = 100000; - AppDemo::jobProgressMsg("Super long job 1"); - AppDemo::jobProgressMax(100); + AppCommon::jobProgressMsg("Super long job 1"); + AppCommon::jobProgressMax(100); for (uint i = 0; i < maxIter; ++i) { SL_LOG("%u", i); int progressPC = (int)((float)i / (float)maxIter * 100.0f); - AppDemo::jobProgressNum(progressPC); + 
AppCommon::jobProgressNum(progressPC); } - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; auto job2 = []() @@ -1975,15 +2001,15 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) PROFILE_SCOPE("Parallel Job 2"); uint maxIter = 100000; - AppDemo::jobProgressMsg("Super long job 2"); - AppDemo::jobProgressMax(100); + AppCommon::jobProgressMsg("Super long job 2"); + AppCommon::jobProgressMax(100); for (uint i = 0; i < maxIter; ++i) { SL_LOG("%u", i); int progressPC = (int)((float)i / (float)maxIter * 100.0f); - AppDemo::jobProgressNum(progressPC); + AppCommon::jobProgressNum(progressPC); } - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; auto followUpJob1 = []() @@ -1991,10 +2017,10 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) auto jobToFollow2 = []() { SL_LOG("JobToFollow2"); }; - AppDemo::jobsToBeThreaded.emplace_back(job1); - AppDemo::jobsToBeThreaded.emplace_back(job2); - AppDemo::jobsToFollowInMain.emplace_back(followUpJob1); - AppDemo::jobsToFollowInMain.emplace_back(jobToFollow2); + AppCommon::jobsToBeThreaded.emplace_back(job1); + AppCommon::jobsToBeThreaded.emplace_back(job2); + AppCommon::jobsToFollowInMain.emplace_back(followUpJob1); + AppCommon::jobsToFollowInMain.emplace_back(jobToFollow2); } #endif @@ -2058,10 +2084,8 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) { if (ImGui::MenuItem("Center", nullptr, sv->viewportAlign() == VA_center)) sv->setViewportFromRatio(sv->viewportRatio(), VA_center, sv->viewportSameAsVideo()); - if (ImGui::MenuItem("Left or top", nullptr, sv->viewportAlign() == VA_leftOrTop)) - sv->setViewportFromRatio(sv->viewportRatio(), VA_leftOrTop, sv->viewportSameAsVideo()); - if (ImGui::MenuItem("Right or bottom", nullptr, sv->viewportAlign() == VA_rightOrBottom)) - sv->setViewportFromRatio(sv->viewportRatio(), VA_rightOrBottom, sv->viewportSameAsVideo()); + if (ImGui::MenuItem("Left or bottom", nullptr, sv->viewportAlign() == VA_leftOrBottom)) + 
sv->setViewportFromRatio(sv->viewportRatio(), VA_leftOrBottom, sv->viewportSameAsVideo()); ImGui::EndMenu(); } @@ -2074,10 +2098,10 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) #if defined(SL_OS_ANDROID) || defined(SL_OS_MACIOS) if (ImGui::BeginMenu("Rotation Sensor")) { - SLDeviceRotation& devRot = AppDemo::devRot; + SLDeviceRotation& devRot = AppCommon::devRot; if (ImGui::MenuItem("Use Device Rotation (IMU)", nullptr, devRot.isUsed())) - devRot.isUsed(!AppDemo::devRot.isUsed()); + devRot.isUsed(!AppCommon::devRot.isUsed()); if (devRot.isUsed()) { @@ -2118,17 +2142,17 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) if (ImGui::BeginMenu("Location Sensor")) { - SLDeviceLocation& devLoc = AppDemo::devLoc; + SLDeviceLocation& devLoc = AppCommon::devLoc; - if (ImGui::MenuItem("Use Device Location (GPS)", nullptr, AppDemo::devLoc.isUsed())) - AppDemo::devLoc.isUsed(!AppDemo::devLoc.isUsed()); + if (ImGui::MenuItem("Use Device Location (GPS)", nullptr, AppCommon::devLoc.isUsed())) + AppCommon::devLoc.isUsed(!AppCommon::devLoc.isUsed()); - if (!AppDemo::devLoc.geoTiffIsAvailableAndValid()) - if (ImGui::MenuItem("Use Origin Altitude", nullptr, AppDemo::devLoc.useOriginAltitude())) - AppDemo::devLoc.useOriginAltitude(!AppDemo::devLoc.useOriginAltitude()); + if (!AppCommon::devLoc.geoTiffIsAvailableAndValid()) + if (ImGui::MenuItem("Use Origin Altitude", nullptr, AppCommon::devLoc.useOriginAltitude())) + AppCommon::devLoc.useOriginAltitude(!AppCommon::devLoc.useOriginAltitude()); if (ImGui::MenuItem("Reset Origin to here")) - AppDemo::devLoc.hasOrigin(false); + AppCommon::devLoc.hasOrigin(false); if (ImGui::BeginMenu("Offset Mode")) { @@ -2191,49 +2215,49 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) { if (ImGui::MenuItem("Start Calibration (Main Camera)")) { - s->onLoad(am, s, sv, SID_VideoCalibrateMain); - showHelpCalibration = false; - showInfosScene = true; + AppCommon::sceneToLoad = SID_VideoCalibrateMain; + 
showHelpCalibration = false; + showInfosScene = true; } if (ImGui::MenuItem("Start Calibration (Scnd. Camera)", nullptr, false, capture->hasSecondaryCamera)) { - s->onLoad(am, s, sv, SID_VideoCalibrateScnd); - showHelpCalibration = false; - showInfosScene = true; + AppCommon::sceneToLoad = SID_VideoCalibrateScnd; + showHelpCalibration = false; + showInfosScene = true; } if (ImGui::MenuItem("Undistort Image", nullptr, ac->showUndistorted(), ac->calibration.state() == CS_calibrated)) ac->showUndistorted(!ac->showUndistorted()); - if (ImGui::MenuItem("No Tangent Distortion", nullptr, AppDemo::calibrationEstimatorParams.zeroTangentDistortion)) - AppDemo::calibrationEstimatorParams.toggleZeroTangentDist(); + if (ImGui::MenuItem("No Tangent Distortion", nullptr, AppCommon::calibrationEstimatorParams.zeroTangentDistortion)) + AppCommon::calibrationEstimatorParams.toggleZeroTangentDist(); - if (ImGui::MenuItem("Fix Aspect Ratio", nullptr, AppDemo::calibrationEstimatorParams.fixAspectRatio)) - AppDemo::calibrationEstimatorParams.toggleFixAspectRatio(); + if (ImGui::MenuItem("Fix Aspect Ratio", nullptr, AppCommon::calibrationEstimatorParams.fixAspectRatio)) + AppCommon::calibrationEstimatorParams.toggleFixAspectRatio(); - if (ImGui::MenuItem("Fix Principal Point", nullptr, AppDemo::calibrationEstimatorParams.fixPrincipalPoint)) - AppDemo::calibrationEstimatorParams.toggleFixPrincipalPoint(); + if (ImGui::MenuItem("Fix Principal Point", nullptr, AppCommon::calibrationEstimatorParams.fixPrincipalPoint)) + AppCommon::calibrationEstimatorParams.toggleFixPrincipalPoint(); - if (ImGui::MenuItem("Use rational model", nullptr, AppDemo::calibrationEstimatorParams.calibRationalModel)) - AppDemo::calibrationEstimatorParams.toggleRationalModel(); + if (ImGui::MenuItem("Use rational model", nullptr, AppCommon::calibrationEstimatorParams.calibRationalModel)) + AppCommon::calibrationEstimatorParams.toggleRationalModel(); - if (ImGui::MenuItem("Use tilted model", nullptr, 
AppDemo::calibrationEstimatorParams.calibTiltedModel)) - AppDemo::calibrationEstimatorParams.toggleTiltedModel(); + if (ImGui::MenuItem("Use tilted model", nullptr, AppCommon::calibrationEstimatorParams.calibTiltedModel)) + AppCommon::calibrationEstimatorParams.toggleTiltedModel(); - if (ImGui::MenuItem("Use thin prism model", nullptr, AppDemo::calibrationEstimatorParams.calibThinPrismModel)) - AppDemo::calibrationEstimatorParams.toggleThinPrismModel(); + if (ImGui::MenuItem("Use thin prism model", nullptr, AppCommon::calibrationEstimatorParams.calibThinPrismModel)) + AppCommon::calibrationEstimatorParams.toggleThinPrismModel(); ImGui::EndMenu(); } CVTrackedFeatures* featureTracker = nullptr; - if (tracker != nullptr && typeid(*tracker) == typeid(CVTrackedFeatures)) - featureTracker = (CVTrackedFeatures*)tracker; + if (gVideoTracker != nullptr && typeid(*gVideoTracker) == typeid(CVTrackedFeatures)) + featureTracker = (CVTrackedFeatures*)gVideoTracker; - if (tracker != nullptr) - if (ImGui::MenuItem("Draw Detection", nullptr, tracker->drawDetection())) - tracker->drawDetection(!tracker->drawDetection()); + if (gVideoTracker != nullptr) + if (ImGui::MenuItem("Draw Detection", nullptr, gVideoTracker->drawDetection())) + gVideoTracker->drawDetection(!gVideoTracker->drawDetection()); if (ImGui::BeginMenu("Feature Tracking", featureTracker != nullptr) && featureTracker != nullptr) { @@ -2803,13 +2827,13 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) ImGui::Separator(); - SLVstring animations = s->animManager().allAnimNames(); + SLVstring animations = s->animManager().animationNames(); if (curAnimIx == -1) curAnimIx = 0; - SLAnimPlayback* anim = s->animManager().allAnimPlayback((SLuint)curAnimIx); + SLAnimPlayback* anim = s->animManager().animPlaybackByIndex((SLuint)curAnimIx); ImGui::PushItemWidth(ImGui::GetWindowWidth() * 0.8f); if (myComboBox("##", &curAnimIx, animations)) - anim = s->animManager().allAnimPlayback((SLuint)curAnimIx); + anim = 
s->animManager().animPlaybackByIndex((SLuint)curAnimIx); ImGui::PopItemWidth(); if (ImGui::MenuItem("Play forward", nullptr, anim->isPlayingForward())) @@ -2886,7 +2910,7 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) ImGui::MenuItem("Stats on Scene", nullptr, &showStatsScene); ImGui::MenuItem("Stats on Video", nullptr, &showStatsVideo); #ifdef SL_BUILD_WAI - if (AppDemo::sceneID == SID_VideoTrackWAI) + if (AppCommon::sceneID == SID_VideoTrackWAI) ImGui::MenuItem("Stats on WAI", nullptr, &showStatsWAI); #endif ImGui::MenuItem("Stats on ImGui", nullptr, &showImGuiMetrics); @@ -2896,15 +2920,15 @@ void AppDemoGui::buildMenuBar(SLScene* s, SLSceneView* sv) ImGui::MenuItem("Scenegraph", nullptr, &showSceneGraph); ImGui::MenuItem("Properties", nullptr, &showProperties); ImGui::MenuItem("Transform", nullptr, &showTransform); - if (AppDemo::devLoc.originLatLonAlt() != SLVec3d::ZERO || - AppDemo::devLoc.defaultLatLonAlt() != SLVec3d::ZERO) + if (AppCommon::devLoc.originLatLonAlt() != SLVec3d::ZERO || + AppCommon::devLoc.defaultLatLonAlt() != SLVec3d::ZERO) ImGui::MenuItem("Date-Time", nullptr, &showDateAndTime); ImGui::MenuItem("UI-Preferences", nullptr, &showUIPrefs); ImGui::Separator(); ImGui::MenuItem("Infos on Device", nullptr, &showInfosDevice); ImGui::MenuItem("Infos on Sensors", nullptr, &showInfosSensors); - if (AppDemo::sceneID >= SID_ErlebARBielBFH && - AppDemo::sceneID <= SID_ErlebARSutzKirchrain18) + if (AppCommon::sceneID >= SID_ErlebAR_BielBFH && + AppCommon::sceneID <= SID_ErlebAR_SutzKirchrain18) { ImGui::Separator(); ImGui::MenuItem("ErlebAR Settings", nullptr, &showErlebAR); @@ -2999,7 +3023,7 @@ void AppDemoGui::buildMenuEdit(SLScene* s, SLSceneView* sv) //! 
Builds context menu if right mouse click is over non-imgui area void AppDemoGui::buildMenuContext(SLScene* s, SLSceneView* sv) { - assert(s->assetManager() && "No asset manager assigned to scene!"); + // assert(s->assetManager() && "No asset manager assigned to scene!"); if (!ImGui::IsWindowHovered(ImGuiHoveredFlags_AnyWindow) && ImGui::IsMouseReleased(1)) @@ -3050,10 +3074,10 @@ void AppDemoGui::buildSceneGraph(SLScene* s) { PROFILE_FUNCTION(); - assert(s->assetManager() && "No asset manager assigned to scene!"); + // assert(s->assetManager() && "No asset manager assigned to scene!"); ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Scenegraph", &showSceneGraph); + ImGui::Begin("Scenegraph", &showSceneGraph, ImGuiWindowFlags_NoNavInputs); if (s->root3D()) addSceneGraphNode(s, s->root3D()); @@ -3070,7 +3094,7 @@ void AppDemoGui::addSceneGraphNode(SLScene* s, SLNode* node) { PROFILE_FUNCTION(); - assert(s->assetManager() && "No asset manager assigned to scene!"); + // assert(s->assetManager() && "No asset manager assigned to scene!"); SLbool isSelectedNode = s->singleNodeSelected() == node; SLbool isLeafNode = node->children().empty() && !node->mesh(); @@ -3140,14 +3164,14 @@ void AppDemoGui::buildProperties(SLScene* s, SLSceneView* sv) { PROFILE_FUNCTION(); - assert(s->assetManager() && "No asset manager assigned to scene!"); + // assert(s->assetManager() && "No asset manager assigned to scene!"); SLNode* singleNode = s->singleNodeSelected(); SLMesh* singleFullMesh = s->singleMeshFullSelected(); bool partialSelection = !s->selectedMeshes().empty() && !s->selectedMeshes()[0]->IS32.empty(); ImGui::PushFont(ImGui::GetIO().Fonts->Fonts[1]); - ImGui::Begin("Properties", &showProperties, ImGuiWindowFlags_AlwaysVerticalScrollbar); + ImGui::Begin("Properties", &showProperties, ImGuiWindowFlags_AlwaysVerticalScrollbar | ImGuiWindowFlags_NoNavInputs); ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.0f, 1.0f, 1.0f, 1.0f)); if (ImGui::TreeNode("Scene 
Properties")) @@ -3704,6 +3728,7 @@ void AppDemoGui::buildProperties(SLScene* s, SLSceneView* sv) { SLParticleSystem* ps = dynamic_cast(singleFullMesh); // Need to check if good practice ImGui::PushItemWidth(ImGui::GetWindowWidth() * 0.5f); + int item_current; if (SLGLState::instance()->glHasGeometryShaders()) { @@ -3731,572 +3756,589 @@ void AppDemoGui::buildProperties(SLScene* s, SLSceneView* sv) if (ImGui::Button("Reset")) ps->isGenerated(false); - // Amount - int amount = ps->amount(); - if (ImGui::InputInt("Amount of particles", &amount)) - { - if (amount <= 0) - amount = 1; - ps->amount(amount); - ps->isGenerated(false); - } - - // TTL (Time to live) - if (ImGui::CollapsingHeader("Time to live")) + if (ImGui::CollapsingHeader("Emission")) { ImGui::Indent(); - float timeToLive = ps->timeToLive(); - if (ImGui::InputFloat("Time to live (s)", &timeToLive)) - { - ps->timeToLive(timeToLive); - ps->isGenerated(false); - singleNode->needAABBUpdate(); - } - // Counter bug lag/gap - bool doCounterGap = ps->doCounterGap(); - if (ImGui::Checkbox("Counter lag/gap", &doCounterGap)) + + // Amount + int amount = ps->amount(); + if (ImGui::InputInt("Amount of particles", &amount)) { - ps->doCounterGap(doCounterGap); - m->programTF(nullptr); + if (amount <= 0) + amount = 1; + ps->amount(amount); ps->isGenerated(false); } - ImGui::TextWrapped("Need to be enable by default but can create flickering with few particles, recommend to disable if few particles with no velocity "); - ImGui::Unindent(); - } - // Radius - float radiusW = ps->radiusW(); - if (ImGui::InputFloat("Radius width", &radiusW)) - { - ps->radiusW(radiusW); - singleNode->needAABBUpdate(); - } - float radiusH = ps->radiusH(); - if (ImGui::InputFloat("Radius height", &radiusH)) - { - ps->radiusH(radiusH); - singleNode->needAABBUpdate(); - } - - // Scale - float scale = ps->scale(); - if (ImGui::InputFloat("Scale", &scale)) - { - ps->scale(scale); - singleNode->needAABBUpdate(); - } + // TTL (Time to live) + if 
(ImGui::CollapsingHeader("Time to live")) + { + ImGui::Indent(); - // World space - SLbool doWorldSpace = ps->doWorldSpace(); - if (ImGui::Checkbox("World space", &doWorldSpace)) - ps->doWorldSpace(doWorldSpace); + float timeToLive = ps->timeToLive(); + if (ImGui::InputFloat("Time to live (s)", &timeToLive)) + { + ps->timeToLive(timeToLive); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + // Counter bug lag/gap + bool doCounterGap = ps->doCounterGap(); + if (ImGui::Checkbox("Counter lag/gap", &doCounterGap)) + { + ps->doCounterGap(doCounterGap); + m->programTF(nullptr); + ps->isGenerated(false); + } + ImGui::TextWrapped("Need to be enable by default but can create flickering with few particles, recommend to disable if few particles with no velocity "); - // Gravity - SLbool doGravity = ps->doGravity(); - if (ImGui::Checkbox("Gravity", &doGravity)) - { - ps->doGravity(doGravity); - m->programTF(nullptr); - ps->isGenerated(false); - singleNode->needAABBUpdate(); - } - if (ImGui::CollapsingHeader("Gravity", &doGravity)) - { - ImGui::Indent(); - float vec3Gravity[3] = {ps->gravity().x, ps->gravity().y, ps->gravity().z}; - if (ImGui::InputFloat3("Gravity XYZ", vec3Gravity)) - { - ps->gravity(vec3Gravity[0], vec3Gravity[1], vec3Gravity[2]); - singleNode->needAABBUpdate(); + ImGui::Unindent(); } - ImGui::Unindent(); - } - // Billboard - int item_current = ps->billboardType(); - if (ImGui::Combo("Billboard Type", - &item_current, - "Camera Billboard\0Vertical Billboard\0Horizontal Billboard\0")) - { - ps->billboardType((SLBillboardType)item_current); - m->program(nullptr); - if (item_current == 2) + // Billboard + item_current = ps->billboardType(); + if (ImGui::Combo("Billboard Type", + &item_current, + "Camera Billboard\0Vertical Billboard\0Horizontal Billboard\0")) { - if (!sv->drawBits()->get(SL_DB_CULLOFF)) - sv->drawBits()->toggle(SL_DB_CULLOFF); - } - else - { - if (sv->drawBits()->get(SL_DB_CULLOFF)) - sv->drawBits()->toggle(SL_DB_CULLOFF); + 
ps->billboardType((SLBillboardType)item_current); + m->program(nullptr); + if (item_current == 2) + { + if (!sv->drawBits()->get(SL_DB_CULLOFF)) + sv->drawBits()->toggle(SL_DB_CULLOFF); + } + else + { + if (sv->drawBits()->get(SL_DB_CULLOFF)) + sv->drawBits()->toggle(SL_DB_CULLOFF); + } } - } - // Velocity - if (ps->doDirectionSpeed()) - ImGui::BeginDisabled(); - if (ImGui::CollapsingHeader("Velocity")) - { - ImGui::Indent(); - item_current = ps->velocityType(); - if (ImGui::Combo("Velocity type", &item_current, "Random axes\0Constant axes\0")) + // Shape + SLbool shape_group = ps->doShape(); + if (ImGui::Checkbox("Shape", &shape_group)) { - ps->velocityType(item_current); + ps->doShape(shape_group); + m->programTF(nullptr); ps->isGenerated(false); singleNode->needAABBUpdate(); } - if (item_current == 0) + if (ImGui::CollapsingHeader("Shape", &shape_group)) { - float vec3fVstart[3] = {ps->velocityRndMin().x, ps->velocityRndMin().y, ps->velocityRndMin().z}; - if (ImGui::InputFloat3("Min. random XYZ", vec3fVstart)) + ImGui::Indent(); + item_current = ps->shapeType(); + if (ImGui::Combo("Shape type", + &item_current, + "Sphere\0Box\0Cone\0Pyramid\0")) { - ps->velocityRndMin(vec3fVstart[0], vec3fVstart[1], vec3fVstart[2]); + ps->shapeType((SLShapeType)item_current); + m->programTF(nullptr); ps->isGenerated(false); singleNode->needAABBUpdate(); } - float vec3fVend[3] = {ps->velocityRndMax().x, ps->velocityRndMax().y, ps->velocityRndMax().z}; - if (ImGui::InputFloat3("Max. 
random XYZ", vec3fVend)) + if (item_current == ST_Sphere) { - ps->velocityRndMax(vec3fVend[0], vec3fVend[1], vec3fVend[2]); - ps->isGenerated(false); - singleNode->needAABBUpdate(); + float radiusSphere = ps->shapeRadius(); + if (ImGui::InputFloat("Radius of the sphere", &radiusSphere)) + { + ps->shapeRadius(radiusSphere); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } } - } - else if (item_current == 1) - { - float vec3fVelocity[3] = {ps->velocityConst().x, ps->velocityConst().y, ps->velocityConst().z}; - if (ImGui::InputFloat3("Constant XYZ", vec3fVelocity)) + if (item_current == ST_Box) + { + float vec3fScaleBox[3] = {ps->shapeScale().x, ps->shapeScale().y, ps->shapeScale().z}; + if (ImGui::InputFloat3("Scale box XYZ", vec3fScaleBox)) + { + ps->shapeScale(vec3fScaleBox[0], vec3fScaleBox[1], vec3fScaleBox[2]); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } + if (item_current == ST_Cone) + { + float radius = ps->shapeRadius(); + if (ImGui::InputFloat("Radius", &radius)) + { + ps->shapeRadius(radius); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + float angle = ps->shapeAngle(); + if (ImGui::InputFloat("Angle", &angle)) + { + ps->shapeAngle(angle); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + float height = ps->shapeHeight(); + if (ImGui::InputFloat("Height", &height)) + { + ps->shapeHeight(height); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } + if (item_current == ST_Pyramid) { - ps->velocityConst(vec3fVelocity[0], vec3fVelocity[1], vec3fVelocity[2]); + float halfSide = ps->shapeWidth(); + if (ImGui::InputFloat("Half side", &halfSide)) + { + ps->shapeWidth(halfSide); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + float angle = ps->shapeAngle(); + if (ImGui::InputFloat("Angle", &angle)) + { + ps->shapeAngle(angle); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + float height = ps->shapeHeight(); + if (ImGui::InputFloat("Height", &height)) + { 
+ ps->shapeHeight(height); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } + // Add surface spawning check box + SLbool shapeSurf = ps->doShapeSurface(); + if (ImGui::Checkbox("Spawn surface", &shapeSurf)) + { + ps->doShapeSurface(shapeSurf); ps->isGenerated(false); - singleNode->needAABBUpdate(); } - } - ImGui::Unindent(); - } - if (ps->doDirectionSpeed()) - ImGui::EndDisabled(); + if (item_current == 2 || item_current == 3) + { + SLbool shapeSpawnBase = ps->doShapeSpawnBase(); + if (ImGui::Checkbox("Spawn base volume", &shapeSpawnBase)) + { + ps->doShapeSpawnBase(shapeSpawnBase); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } - // Direction and speed: Add maybe later mix with velocity - SLbool directionSpeed_group = ps->doDirectionSpeed(); - if (ImGui::Checkbox("Direction and Speed", &directionSpeed_group)) - { - ps->doDirectionSpeed(directionSpeed_group); - ps->isGenerated(false); - singleNode->needAABBUpdate(); - } - if (ImGui::CollapsingHeader("Direction and Speed", &directionSpeed_group)) - { - ImGui::Indent(); - float vec3fDirection[3] = {ps->direction().x, ps->direction().y, ps->direction().z}; // Direction - if (ImGui::InputFloat3("Constant XYZ", vec3fDirection)) - { - ps->direction(vec3fDirection[0], vec3fDirection[1], vec3fDirection[2]); - ps->isGenerated(false); - singleNode->needAABBUpdate(); - } - // Speed - item_current = ps->doSpeedRange() ? 
1 : 0; - if (ImGui::Combo("Speed value", &item_current, "Constant\0Random between two constants\0")) - { - if (item_current == 1) - ps->doSpeedRange(true); - else - ps->doSpeedRange(false); + if (!ps->doDirectionSpeed()) + ImGui::BeginDisabled(); + ImGui::LabelText("Condition", "Need to have direction and speed enabled"); + if (item_current == 2 || item_current == 3) + { + SLbool shapeOverride = ps->doShapeOverride(); + if (ImGui::Checkbox("Follow shape direction (Override direction)", + &shapeOverride)) + { + ps->doShapeOverride(shapeOverride); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } + else if (item_current == 0 || item_current == 1) + { + SLbool shapeOverride = ps->doShapeOverride(); + if (ImGui::Checkbox("Inverse center direction (Override direction)", &shapeOverride)) + { + ps->doShapeOverride(shapeOverride); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } - ps->isGenerated(false); - singleNode->needAABBUpdate(); + if (!ps->doDirectionSpeed()) + ImGui::EndDisabled(); + ImGui::Unindent(); } - if (!ps->doSpeedRange()) + + // Flipbook texture + if (ps->texFlipbook()) { - float speed = ps->speed(); - if (ImGui::InputFloat("Constant", &speed)) + SLbool flipbookTex_group = ps->doFlipBookTexture(); + if (ImGui::Checkbox("Flipbook texture", &flipbookTex_group)) { - ps->speed(speed); + ps->doFlipBookTexture(flipbookTex_group); + m->program(nullptr); + m->programTF(nullptr); + ps->changeTexture(); // Switch texture ps->isGenerated(false); - singleNode->needAABBUpdate(); } - } - else - { - float vec2fRange[2] = {ps->speedRange().x, ps->speedRange().y}; - if (ImGui::InputFloat2("Random range Speed", vec2fRange)) + if (ImGui::CollapsingHeader("Flipbook texture", &flipbookTex_group)) { - ps->speedRange(vec2fRange[0], vec2fRange[1]); - ps->isGenerated(false); - singleNode->needAABBUpdate(); + ImGui::Indent(); + int fR = ps->frameRateFB(); + if (ImGui::InputInt("Frame rate (num update by s)", &fR)) + { + ps->frameRateFB(fR); + } + 
ImGui::Unindent(); } } + ImGui::Unindent(); } - // Color checkbox - SLbool color_group = ps->doColor(); - if (ImGui::Checkbox("Color", &color_group)) - { - ps->doColor(color_group); - m->program(nullptr); - } - if (ImGui::CollapsingHeader("Color", &color_group)) + if (ImGui::CollapsingHeader("Size")) { ImGui::Indent(); - // Color blending brightness/glow - SLbool color_bright = ps->doBlendBrightness(); - if (ImGui::Checkbox("Glow/Bright (blending effect)", &color_bright)) - { - ps->doBlendBrightness(color_bright); - } - // Color - if (ps->doColorOverLT()) - ImGui::BeginDisabled(); - ImGuiColorEditFlags cef = ImGuiColorEditFlags_NoInputs; - SLCol4f c = ps->color(); - if (ImGui::ColorEdit4("Particle color", (float*)&c, cef)) - ps->color(c); - if (ps->doColorOverLT()) - ImGui::EndDisabled(); - - // Color over lifetime - SLbool doColorOverLT_group = ps->doColorOverLT(); - static ImGradient gradient; - static ImGradientMark* draggingMark = nullptr; - static ImGradientMark* selectedMark = nullptr; - - static bool once = [ps]() + // Radius and Scale + float radiusW = ps->radiusW(); + if (ImGui::InputFloat("Radius width", &radiusW)) { - gradient.getMarks().clear(); - for (auto cp : ps->colorPoints()) - gradient.addMark(cp.pos, ImColor(cp.color.r, cp.color.g, cp.color.b)); - return true; - }(); - - if (ImGui::Checkbox("Color over lifetime", &doColorOverLT_group)) + ps->radiusW(radiusW); + singleNode->needAABBUpdate(); + } + float radiusH = ps->radiusH(); + if (ImGui::InputFloat("Radius height", &radiusH)) { - ps->doColorOverLT(doColorOverLT_group); - ps->colorArr(gradient.cachedValues()); - m->program(nullptr); + ps->radiusH(radiusH); + singleNode->needAABBUpdate(); } - - if (ImGui::CollapsingHeader("Color over lifetime", &doColorOverLT_group)) + float scale = ps->scale(); + if (ImGui::InputFloat("Scale", &scale)) { - if (ImGui::GradientEditor(&gradient, draggingMark, selectedMark)) - { - ps->colorPoints().clear(); - for (auto cp : gradient.getMarks()) - 
ps->colorPoints().push_back(SLColorLUTPoint(SLCol3f(cp->color), cp->position)); - ps->colorArr(gradient.cachedValues()); - } + ps->scale(scale); + singleNode->needAABBUpdate(); } - ImGui::Unindent(); - } - // Rotation - SLbool rot_group = ps->doRotation(); - if (ImGui::Checkbox("Rotation", &rot_group)) - { - ps->doRotation(rot_group); - m->program(nullptr); - m->programTF(nullptr); - ps->isGenerated(false); - } - if (ImGui::CollapsingHeader("Rotation", &rot_group)) - { - ImGui::Indent(); - item_current = ps->doRotRange() ? 1 : 0; - if (ImGui::Combo("Angular velocity value", &item_current, "Constant\0Random between two constants\0")) + // Size over lifetime + SLbool doSizeOverLT_group = ps->doSizeOverLT(); + if (ImGui::Checkbox("Size over lifetime", &doSizeOverLT_group)) { - if (item_current == 1) - ps->doRotRange(true); - else - ps->doRotRange(false); - - m->programTF(nullptr); - ps->isGenerated(false); + ps->doSizeOverLT(doSizeOverLT_group); + m->program(nullptr); + singleNode->needAABBUpdate(); } - if (!ps->doRotRange()) + if (ImGui::CollapsingHeader("Size over lifetime", &doSizeOverLT_group)) { - float angularVelocityConst = ps->angularVelocityConst(); - if (ImGui::InputFloat("Constant", &angularVelocityConst)) + ImGui::Indent(); + SLbool doSizeOverLTCurve_group = ps->doSizeOverLTCurve(); + if (ImGui::Checkbox("Custom curve (Unchecked --> Linear function)2", &doSizeOverLTCurve_group)) { - ps->angularVelocityConst(angularVelocityConst); + ps->doSizeOverLTCurve(doSizeOverLTCurve_group); + m->program(nullptr); } - } - else - { - float vec2fRange[2] = {ps->angularVelocityRange().x, ps->angularVelocityRange().y}; - if (ImGui::InputFloat2("Random range A.V", vec2fRange)) + if (ImGui::CollapsingHeader("Bezier curve size", &doSizeOverLTCurve_group)) { - ps->angularVelocityRange(vec2fRange[0], vec2fRange[1]); - ps->isGenerated(false); + ImGui::Indent(); + float* vSize = ps->bezierControlPointSize(); + float* staEndSize = ps->bezierStartEndPointSize(); + if 
(ImGui::Bezier("easeInExpo", vSize, staEndSize)) + ps->generateBernsteinPSize(); + ImGui::Unindent(); } + ImGui::Unindent(); } + ImGui::Unindent(); } - // Shape - SLbool shape_group = ps->doShape(); - if (ImGui::Checkbox("Shape", &shape_group)) - { - ps->doShape(shape_group); - m->programTF(nullptr); - ps->isGenerated(false); - singleNode->needAABBUpdate(); - } - if (ImGui::CollapsingHeader("Shape", &shape_group)) + if (ImGui::CollapsingHeader("Movement")) { ImGui::Indent(); - item_current = ps->shapeType(); - if (ImGui::Combo("Shape type", &item_current, "Sphere\0Box\0Cone\0Pyramid\0")) + + // World space + SLbool doWorldSpace = ps->doWorldSpace(); + if (ImGui::Checkbox("World space", &doWorldSpace)) + ps->doWorldSpace(doWorldSpace); + + // Gravity + SLbool doGravity = ps->doGravity(); + if (ImGui::Checkbox("Gravity", &doGravity)) { - ps->shapeType((SLShapeType)item_current); + ps->doGravity(doGravity); m->programTF(nullptr); ps->isGenerated(false); singleNode->needAABBUpdate(); } - if (item_current == ST_Sphere) + if (ImGui::CollapsingHeader("Gravity", &doGravity)) { - float radiusSphere = ps->shapeRadius(); - if (ImGui::InputFloat("Radius of the sphere", &radiusSphere)) + ImGui::Indent(); + float vec3Gravity[3] = {ps->gravity().x, ps->gravity().y, ps->gravity().z}; + if (ImGui::InputFloat3("Gravity XYZ", vec3Gravity)) { - ps->shapeRadius(radiusSphere); - ps->isGenerated(false); + ps->gravity(vec3Gravity[0], vec3Gravity[1], vec3Gravity[2]); singleNode->needAABBUpdate(); } + ImGui::Unindent(); } - if (item_current == ST_Box) + + // Acceleration + SLbool acc_group = ps->doAcc(); + if (ImGui::Checkbox("Acceleration", &acc_group)) { - float vec3fScaleBox[3] = {ps->shapeScale().x, ps->shapeScale().y, ps->shapeScale().z}; - if (ImGui::InputFloat3("Scale box XYZ", vec3fScaleBox)) - { - ps->shapeScale(vec3fScaleBox[0], vec3fScaleBox[1], vec3fScaleBox[2]); - ps->isGenerated(false); - singleNode->needAABBUpdate(); - } + ps->doAcceleration(acc_group); + 
m->programTF(nullptr); + singleNode->needAABBUpdate(); + ps->isGenerated(false); } - if (item_current == ST_Cone) + if (ImGui::CollapsingHeader("Acceleration", &acc_group)) { - float radius = ps->shapeRadius(); - if (ImGui::InputFloat("Radius", &radius)) + ImGui::Indent(); + if (ps->doAccDiffDir()) + ImGui::BeginDisabled(); + float accConst = ps->accelerationConst(); + if (ImGui::InputFloat("Accelaration constant", &accConst)) { - ps->shapeRadius(radius); - ps->isGenerated(false); + ps->accConst(accConst); singleNode->needAABBUpdate(); } - float angle = ps->shapeAngle(); - if (ImGui::InputFloat("Angle", &angle)) + if (ps->doAccDiffDir()) + ImGui::EndDisabled(); + SLbool accDiffDirection_group = ps->doAccDiffDir(); + if (ImGui::Checkbox("Direction vector", &accDiffDirection_group)) { - ps->shapeAngle(angle); - ps->isGenerated(false); + ps->doAccDiffDir(accDiffDirection_group); + m->programTF(nullptr); singleNode->needAABBUpdate(); } - float height = ps->shapeHeight(); - if (ImGui::InputFloat("Height", &height)) + if (ImGui::CollapsingHeader("Direction vector", &accDiffDirection_group)) { - ps->shapeHeight(height); - ps->isGenerated(false); + float vec3fAcc[3] = {ps->acceleration().x, ps->acceleration().y, ps->acceleration().z}; + ImGui::InputFloat3("input float3", vec3fAcc); + ps->acceleration(vec3fAcc[0], vec3fAcc[1], vec3fAcc[2]); singleNode->needAABBUpdate(); } + ImGui::Unindent(); } - if (item_current == ST_Pyramid) + + // Velocity + if (ps->doDirectionSpeed()) + ImGui::BeginDisabled(); + if (ImGui::CollapsingHeader("Velocity")) { - float halfSide = ps->shapeWidth(); - if (ImGui::InputFloat("Half side", &halfSide)) + ImGui::Indent(); + item_current = ps->velocityType(); + if (ImGui::Combo("Velocity type", &item_current, "Random axes\0Constant axes\0")) { - ps->shapeWidth(halfSide); + ps->velocityType(item_current); ps->isGenerated(false); singleNode->needAABBUpdate(); } - float angle = ps->shapeAngle(); - if (ImGui::InputFloat("Angle", &angle)) + if 
(item_current == 0) { - ps->shapeAngle(angle); - ps->isGenerated(false); - singleNode->needAABBUpdate(); + float vec3fVstart[3] = {ps->velocityRndMin().x, ps->velocityRndMin().y, ps->velocityRndMin().z}; + if (ImGui::InputFloat3("Min. random XYZ", vec3fVstart)) + { + ps->velocityRndMin(vec3fVstart[0], vec3fVstart[1], vec3fVstart[2]); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + float vec3fVend[3] = {ps->velocityRndMax().x, ps->velocityRndMax().y, ps->velocityRndMax().z}; + if (ImGui::InputFloat3("Max. random XYZ", vec3fVend)) + { + ps->velocityRndMax(vec3fVend[0], vec3fVend[1], vec3fVend[2]); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } } - float height = ps->shapeHeight(); - if (ImGui::InputFloat("Height", &height)) + else if (item_current == 1) { - ps->shapeHeight(height); - ps->isGenerated(false); - singleNode->needAABBUpdate(); + float vec3fVelocity[3] = {ps->velocityConst().x, ps->velocityConst().y, ps->velocityConst().z}; + if (ImGui::InputFloat3("Constant XYZ", vec3fVelocity)) + { + ps->velocityConst(vec3fVelocity[0], vec3fVelocity[1], vec3fVelocity[2]); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } } + ImGui::Unindent(); } - // Add surface spawning check box - SLbool shapeSurf = ps->doShapeSurface(); - if (ImGui::Checkbox("Spawn surface", &shapeSurf)) + if (ps->doDirectionSpeed()) + ImGui::EndDisabled(); + + // Direction and speed: Add maybe later mix with velocity + SLbool directionSpeed_group = ps->doDirectionSpeed(); + if (ImGui::Checkbox("Direction and Speed", &directionSpeed_group)) { - ps->doShapeSurface(shapeSurf); + ps->doDirectionSpeed(directionSpeed_group); ps->isGenerated(false); + singleNode->needAABBUpdate(); } - if (item_current == 2 || item_current == 3) + + if (ImGui::CollapsingHeader("Direction and Speed", &directionSpeed_group)) { - SLbool shapeSpawnBase = ps->doShapeSpawnBase(); - if (ImGui::Checkbox("Spawn base volume", &shapeSpawnBase)) + ImGui::Indent(); + float vec3fDirection[3] = 
{ps->direction().x, ps->direction().y, ps->direction().z}; // Direction + if (ImGui::InputFloat3("Constant XYZ", vec3fDirection)) { - ps->doShapeSpawnBase(shapeSpawnBase); + ps->direction(vec3fDirection[0], vec3fDirection[1], vec3fDirection[2]); ps->isGenerated(false); singleNode->needAABBUpdate(); } - } - - if (!ps->doDirectionSpeed()) - ImGui::BeginDisabled(); - ImGui::LabelText("Condition", "Need to have direction and speed enabled"); - if (item_current == 2 || item_current == 3) - { - SLbool shapeOverride = ps->doShapeOverride(); - if (ImGui::Checkbox("Follow shape direction (Override direction)", &shapeOverride)) + // Speed + item_current = ps->doSpeedRange() ? 1 : 0; + if (ImGui::Combo("Speed value", + &item_current, + "Constant\0Random between two constants\0")) { - ps->doShapeOverride(shapeOverride); + if (item_current == 1) + ps->doSpeedRange(true); + else + ps->doSpeedRange(false); + ps->isGenerated(false); singleNode->needAABBUpdate(); } - } - else if (item_current == 0 || item_current == 1) - { - SLbool shapeOverride = ps->doShapeOverride(); - if (ImGui::Checkbox("Inverse center direction (Override direction)", &shapeOverride)) + if (!ps->doSpeedRange()) + { + float speed = ps->speed(); + if (ImGui::InputFloat("Constant", &speed)) + { + ps->speed(speed); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } + else + { + float vec2fRange[2] = {ps->speedRange().x, ps->speedRange().y}; + if (ImGui::InputFloat2("Random range Speed", vec2fRange)) + { + ps->speedRange(vec2fRange[0], vec2fRange[1]); + ps->isGenerated(false); + singleNode->needAABBUpdate(); + } + } + + // Rotation + SLbool rot_group = ps->doRotation(); + if (ImGui::Checkbox("Rotation", &rot_group)) { - ps->doShapeOverride(shapeOverride); + ps->doRotation(rot_group); + m->program(nullptr); + m->programTF(nullptr); ps->isGenerated(false); - singleNode->needAABBUpdate(); } - } + if (ImGui::CollapsingHeader("Rotation", &rot_group)) + { + ImGui::Indent(); + item_current = 
ps->doRotRange() ? 1 : 0; + if (ImGui::Combo("Angular velocity value", &item_current, "Constant\0Random between two constants\0")) + { + if (item_current == 1) + ps->doRotRange(true); + else + ps->doRotRange(false); - if (!ps->doDirectionSpeed()) - ImGui::EndDisabled(); - ImGui::Unindent(); - } + m->programTF(nullptr); + ps->isGenerated(false); + } + if (!ps->doRotRange()) + { + float angularVelocityConst = ps->angularVelocityConst(); + if (ImGui::InputFloat("Constant", &angularVelocityConst)) + { + ps->angularVelocityConst(angularVelocityConst); + } + } + else + { + float vec2fRange[2] = {ps->angularVelocityRange().x, ps->angularVelocityRange().y}; + if (ImGui::InputFloat2("Random range A.V", vec2fRange)) + { + ps->angularVelocityRange(vec2fRange[0], vec2fRange[1]); + ps->isGenerated(false); + } + } + ImGui::Unindent(); + } - // Acceleration - SLbool acc_group = ps->doAcc(); - if (ImGui::Checkbox("Acceleration", &acc_group)) - { - ps->doAcceleration(acc_group); - m->programTF(nullptr); - singleNode->needAABBUpdate(); - ps->isGenerated(false); - } - if (ImGui::CollapsingHeader("Acceleration", &acc_group)) - { - ImGui::Indent(); - if (ps->doAccDiffDir()) - ImGui::BeginDisabled(); - float accConst = ps->accelerationConst(); - if (ImGui::InputFloat("Accelaration constant", &accConst)) - { - ps->accConst(accConst); - singleNode->needAABBUpdate(); - } - if (ps->doAccDiffDir()) - ImGui::EndDisabled(); - SLbool accDiffDirection_group = ps->doAccDiffDir(); - if (ImGui::Checkbox("Direction vector", &accDiffDirection_group)) - { - ps->doAccDiffDir(accDiffDirection_group); - m->programTF(nullptr); - singleNode->needAABBUpdate(); - } - if (ImGui::CollapsingHeader("Direction vector", &accDiffDirection_group)) - { - float vec3fAcc[3] = {ps->acceleration().x, ps->acceleration().y, ps->acceleration().z}; - ImGui::InputFloat3("input float3", vec3fAcc); - ps->acceleration(vec3fAcc[0], vec3fAcc[1], vec3fAcc[2]); - singleNode->needAABBUpdate(); + ImGui::Unindent(); } + 
ImGui::Unindent(); } - // Alpha over lifetime - SLbool doAlphaOverL_group = ps->doAlphaOverLT(); - if (ImGui::Checkbox("Alpha over lifetime", &doAlphaOverL_group)) - { - ps->doAlphaOverLT(doAlphaOverL_group); - m->program(nullptr); - } - if (ImGui::CollapsingHeader("Alpha over lifetime", &doAlphaOverL_group)) + if (ImGui::CollapsingHeader("Color")) { ImGui::Indent(); - SLbool doAlphaOverLCurve_group = ps->doAlphaOverLTCurve(); - if (ImGui::Checkbox("Custom curve (Unchecked --> Linear function)", &doAlphaOverLCurve_group)) + + // Color checkbox + SLbool color_group = ps->doColor(); + if (ImGui::Checkbox("Color", &color_group)) { - ps->doAlphaOverLTCurve(doAlphaOverLCurve_group); + ps->doColor(color_group); m->program(nullptr); } - if (ImGui::CollapsingHeader("Bezier curve alpha", &doAlphaOverLCurve_group)) + if (ImGui::CollapsingHeader("Color", &color_group)) { ImGui::Indent(); - float* vAlpha = ps->bezierControlPointAlpha(); - float* staEndAlpha = ps->bezierStartEndPointAlpha(); - if (ImGui::Bezier("easeInExpo", vAlpha, staEndAlpha)) - ps->generateBernsteinPAlpha(); + // Color blending brightness/glow + SLbool color_bright = ps->doBlendBrightness(); + if (ImGui::Checkbox("Glow/Bright (blending effect)", &color_bright)) + { + ps->doBlendBrightness(color_bright); + } + + // Color + if (ps->doColorOverLT()) + ImGui::BeginDisabled(); + ImGuiColorEditFlags cef = ImGuiColorEditFlags_NoInputs; + SLCol4f c = ps->color(); + if (ImGui::ColorEdit4("Particle color", (float*)&c, cef)) + ps->color(c); + if (ps->doColorOverLT()) + ImGui::EndDisabled(); + + // Color over lifetime + SLbool doColorOverLT_group = ps->doColorOverLT(); + + if (ImGui::Checkbox("Color over lifetime", &doColorOverLT_group)) + { + ps->doColorOverLT(doColorOverLT_group); + //ps->colorArr(gradient.cachedValues()); + m->program(nullptr); + } + + if (ImGui::CollapsingHeader("Color over lifetime", &doColorOverLT_group)) + { + ImGui::Text("Edit gradient colors in the texture section."); + } ImGui::Unindent(); } 
- ImGui::Unindent(); - } - // Size over lifetime - SLbool doSizeOverLT_group = ps->doSizeOverLT(); - if (ImGui::Checkbox("Size over lifetime", &doSizeOverLT_group)) - { - ps->doSizeOverLT(doSizeOverLT_group); - m->program(nullptr); - singleNode->needAABBUpdate(); - } - if (ImGui::CollapsingHeader("Size over lifetime", &doSizeOverLT_group)) - { - ImGui::Indent(); - SLbool doSizeOverLTCurve_group = ps->doSizeOverLTCurve(); - if (ImGui::Checkbox("Custom curve (Unchecked --> Linear function)2", &doSizeOverLTCurve_group)) + // Alpha over lifetime + SLbool doAlphaOverL_group = ps->doAlphaOverLT(); + if (ImGui::Checkbox("Alpha over lifetime", &doAlphaOverL_group)) { - ps->doSizeOverLTCurve(doSizeOverLTCurve_group); + ps->doAlphaOverLT(doAlphaOverL_group); m->program(nullptr); } - if (ImGui::CollapsingHeader("Bezier curve size", &doSizeOverLTCurve_group)) + if (ImGui::CollapsingHeader("Alpha over lifetime", &doAlphaOverL_group)) { ImGui::Indent(); - float* vSize = ps->bezierControlPointSize(); - float* staEndSize = ps->bezierStartEndPointSize(); - if (ImGui::Bezier("easeInExpo", vSize, staEndSize)) - ps->generateBernsteinPSize(); + SLbool doAlphaOverLCurve_group = ps->doAlphaOverLTCurve(); + if (ImGui::Checkbox("Custom curve (Unchecked --> Linear function)", &doAlphaOverLCurve_group)) + { + ps->doAlphaOverLTCurve(doAlphaOverLCurve_group); + m->program(nullptr); + } + if (ImGui::CollapsingHeader("Bezier curve alpha", &doAlphaOverLCurve_group)) + { + ImGui::Indent(); + float* vAlpha = ps->bezierControlPointAlpha(); + float* staEndAlpha = ps->bezierStartEndPointAlpha(); + if (ImGui::Bezier("easeInExpo", vAlpha, staEndAlpha)) + ps->generateBernsteinPAlpha(); + ImGui::Unindent(); + } ImGui::Unindent(); } - ImGui::Unindent(); - } - // Flipbook texture - if (ps->textureFlipbook() == nullptr) - ImGui::BeginDisabled(); - SLbool flipbookTex_group = ps->doFlipBookTexture(); - if (ImGui::Checkbox("Flipbook texture", &flipbookTex_group)) - { - ps->doFlipBookTexture(flipbookTex_group); 
- m->program(nullptr); - m->programTF(nullptr); - ps->changeTexture(); // Switch texture - ps->isGenerated(false); - } - if (ImGui::CollapsingHeader("Flipbook texture", &flipbookTex_group)) - { - ImGui::Indent(); - int fR = ps->frameRateFB(); - if (ImGui::InputInt("Frame rate (num update by s)", &fR)) - { - ps->frameRateFB(fR); - } ImGui::Unindent(); } - if (ps->textureFlipbook() == nullptr) - ImGui::EndDisabled(); ImGui::PopItemWidth(); ImGui::TreePop(); } } - if (m->numTextures() > 0 && - ImGui::TreeNode("Tex", "Textures (%d)", m->numTextures())) + // Textures + if (m->numTextures() > 0 && ImGui::TreeNode("Tex", "Textures (%d)", m->numTextures())) { for (int tt = 0; tt < TT_numTextureType; ++tt) for (auto& tex : m->textures((SLTextureType)tt)) @@ -4305,39 +4347,48 @@ void AppDemoGui::buildProperties(SLScene* s, SLSceneView* sv) ImGui::TreePop(); } - if (m->program() != nullptr) + // Shaders + size_t numShaders = m->program() ? m->program()->shaders().size() : 0; + numShaders += m->programTF() ? 
m->programTF()->shaders().size() : 0; + + if (numShaders > 0 && ImGui::TreeNode("Shd", "Shaders (%d)", (int)numShaders)) { - for (auto* shd : m->program()->shaders()) + if (m->program() != nullptr) { - if (ImGui::TreeNode(shd->name().c_str())) + for (auto* shd : m->program()->shaders()) { - SLchar* text = new char[shd->code().length() + 1]; - strcpy(text, shd->code().c_str()); - ImGui::InputTextMultiline(shd->name().c_str(), - text, - shd->code().length() + 1, - ImVec2(-1.0f, -1.0f)); - ImGui::TreePop(); - delete[] text; + if (ImGui::TreeNode(shd->name().c_str())) + { + SLchar* text = new char[shd->code().length() + 1]; + strcpy(text, shd->code().c_str()); + ImGui::InputTextMultiline(shd->name().c_str(), + text, + shd->code().length() + 1, + ImVec2(-1.0f, -1.0f)); + ImGui::TreePop(); + delete[] text; + } } } - } - if (m->programTF() != nullptr) - { - for (auto* shd : m->programTF()->shaders()) + if (m->programTF() != nullptr) { - if (ImGui::TreeNode(shd->name().c_str())) + for (auto* shd : m->programTF()->shaders()) { - SLchar* text = new char[shd->code().length() + 1]; - strcpy(text, shd->code().c_str()); - ImGui::InputTextMultiline(shd->name().c_str(), - text, - shd->code().length() + 1, - ImVec2(-1.0f, -1.0f)); - ImGui::TreePop(); - delete[] text; + if (ImGui::TreeNode(shd->name().c_str())) + { + SLchar* text = new char[shd->code().length() + 1]; + strcpy(text, shd->code().c_str()); + ImGui::InputTextMultiline(shd->name().c_str(), + text, + shd->code().length() + 1, + ImVec2(-1.0f, -1.0f)); + ImGui::TreePop(); + delete[] text; + } } } + + ImGui::TreePop(); } ImGui::TreePop(); @@ -4353,7 +4404,7 @@ void AppDemoGui::buildProperties(SLScene* s, SLSceneView* sv) else if (!singleFullMesh && !s->selectedMeshes().empty()) { // See also SLMesh::handleRectangleSelection - ImGui::Begin("Properties of Selection", &showProperties, ImGuiWindowFlags_AlwaysVerticalScrollbar); + ImGui::Begin("Properties of Selection", &showProperties, ImGuiWindowFlags_AlwaysVerticalScrollbar | 
ImGuiWindowFlags_NoNavInputs); for (auto* selectedNode : s->selectedNodes()) { @@ -4451,13 +4502,13 @@ void AppDemoGui::showTexInfos(SLGLTexture* tex) if (typeid(*tex) == typeid(SLTexColorLUT)) { SLTexColorLUT* lut = (SLTexColorLUT*)tex; - if (ImGui::TreeNode("Color Points in Transfer Function")) + if (ImGui::TreeNode("Color Points in Gradient")) { showLUTColors(lut); ImGui::TreePop(); } - if (ImGui::TreeNode("Alpha Points in Transfer Function")) + if (ImGui::TreeNode("Alpha Points in Gradient")) { for (SLulong a = 0; a < lut->alphas().size(); ++a) { @@ -4537,20 +4588,20 @@ void AppDemoGui::showTexInfos(SLGLTexture* tex) void AppDemoGui::loadConfig(SLint dotsPerInch) { ImGuiStyle& style = ImGui::GetStyle(); - SLstring fullPathAndFilename = AppDemo::configPath + - AppDemo::name + ".yml"; + SLstring fullPathAndFilename = AppCommon::configPath + + AppCommon::name + ".yml"; if (!SLFileStorage::exists(fullPathAndFilename, IOK_config)) { SL_LOG("No config file %s: ", fullPathAndFilename.c_str()); // Scale for proportional and fixed size fonts - SLfloat dpiScaleProp = (float)dotsPerInch / 120.0f; + SLfloat dpiScaleProp = (float)dotsPerInch / 142.0f; SLfloat dpiScaleFixed = (float)dotsPerInch / 142.0f; // Default settings for the first time - SLGLImGui::fontPropDots = std::max(16.0f * dpiScaleProp, 16.0f); - SLGLImGui::fontFixedDots = std::max(13.0f * dpiScaleFixed, 13.0f); + SLImGui::fontPropDots = std::max(16.0f * dpiScaleProp, 16.0f); + SLImGui::fontFixedDots = std::max(13.0f * dpiScaleFixed, 13.0f); // Store dialog show states AppDemoGui::showAbout = true; @@ -4590,8 +4641,8 @@ void AppDemoGui::loadConfig(SLint dotsPerInch) SLint i = 0; SLbool b = false; fs["configTime"] >> AppDemoGui::configTime; - fs["fontPropDots"] >> i; SLGLImGui::fontPropDots = (SLfloat) i; - fs["fontFixedDots"] >> i; SLGLImGui::fontFixedDots = (SLfloat) i; + fs["fontPropDots"] >> i; SLImGui::fontPropDots = (SLfloat) i; + fs["fontFixedDots"] >> i; SLImGui::fontFixedDots = (SLfloat) i; 
fs["ItemSpacingX"] >> i; style.ItemSpacing.x = (SLfloat) i; fs["ItemSpacingY"] >> i; style.ItemSpacing.y = (SLfloat) i; style.WindowPadding.x = style.FramePadding.x = style.ItemInnerSpacing.x = style.ItemSpacing.x; @@ -4603,7 +4654,7 @@ void AppDemoGui::loadConfig(SLint dotsPerInch) style.ScrollbarSize = 16.0f; fs["ScrollbarRounding"] >> i; style.ScrollbarRounding = (SLfloat) i; - fs["sceneID"] >> i; AppDemo::sceneID = (SLSceneID) i; + fs["sceneID"] >> i; AppCommon::sceneID = (SLSceneID) i; fs["showInfosScene"] >> b; AppDemoGui::showInfosScene = b; fs["showStatsTiming"] >> b; AppDemoGui::showStatsTiming = b; fs["showStatsMemory"] >> b; AppDemoGui::showStatsScene = b; @@ -4623,8 +4674,8 @@ void AppDemoGui::loadConfig(SLint dotsPerInch) fs.release(); SL_LOG("Config. loaded : %s", fullPathAndFilename.c_str()); SL_LOG("Config. date : %s", AppDemoGui::configTime.c_str()); - SL_LOG("fontPropDots : %f", SLGLImGui::fontPropDots); - SL_LOG("fontFixedDots : %f", SLGLImGui::fontFixedDots); + SL_LOG("fontPropDots : %f", SLImGui::fontPropDots); + SL_LOG("fontFixedDots : %f", SLImGui::fontFixedDots); } else { @@ -4639,16 +4690,16 @@ void AppDemoGui::loadConfig(SLint dotsPerInch) // check font sizes for HDPI displays if (dotsPerInch > 300) { - if (SLGLImGui::fontPropDots < 16.1f && - SLGLImGui::fontFixedDots < 13.1) + if (SLImGui::fontPropDots < 16.1f && + SLImGui::fontFixedDots < 13.1) { // Scale for proportional and fixed size fonts SLfloat dpiScaleProp = (float)dotsPerInch / 120.0f; SLfloat dpiScaleFixed = (float)dotsPerInch / 142.0f; // Default settings for the first time - SLGLImGui::fontPropDots = std::max(16.0f * dpiScaleProp, 16.0f); - SLGLImGui::fontFixedDots = std::max(13.0f * dpiScaleFixed, 13.0f); + SLImGui::fontPropDots = std::max(16.0f * dpiScaleProp, 16.0f); + SLImGui::fontFixedDots = std::max(13.0f * dpiScaleFixed, 13.0f); } } } @@ -4656,14 +4707,14 @@ void AppDemoGui::loadConfig(SLint dotsPerInch) #ifdef SL_EMSCRIPTEN // Overwrite config with URL parameters // 
clang-format off - int sceneId = MAIN_THREAD_EM_ASM_INT( + int sceneId = EM_ASM_INT( let params = new URL(window.location).searchParams; return params.get("scene") ?? -1; ); // clang-format on if (sceneId != -1) - AppDemo::sceneID = (SLSceneID)sceneId; + AppCommon::sceneID = (SLSceneID)sceneId; #endif } //----------------------------------------------------------------------------- @@ -4671,8 +4722,8 @@ void AppDemoGui::loadConfig(SLint dotsPerInch) void AppDemoGui::saveConfig() { ImGuiStyle& style = ImGui::GetStyle(); - SLstring fullPathAndFilename = AppDemo::configPath + - AppDemo::name + ".yml"; + SLstring fullPathAndFilename = AppCommon::configPath + + AppCommon::name + ".yml"; if (!SLFileStorage::exists(fullPathAndFilename, IOK_config)) SL_LOG("New config file will be written: %s", @@ -4689,13 +4740,13 @@ void AppDemoGui::saveConfig() } fs << "configTime" << Utils::getLocalTimeString(); - fs << "fontPropDots" << (SLint)SLGLImGui::fontPropDots; - fs << "fontFixedDots" << (SLint)SLGLImGui::fontFixedDots; - if (AppDemo::sceneID == SID_VolumeRayCastLighted || - AppDemo::sceneID == SID_VolumeRayCast) + fs << "fontPropDots" << (SLint)SLImGui::fontPropDots; + fs << "fontFixedDots" << (SLint)SLImGui::fontFixedDots; + if (AppCommon::sceneID == SID_VolumeRayCastLighted || + AppCommon::sceneID == SID_VolumeRayCast) fs << "sceneID" << (SLint)SID_Minimal; else - fs << "sceneID" << (SLint)AppDemo::sceneID; + fs << "sceneID" << (SLint)AppCommon::sceneID; fs << "ItemSpacingX" << (SLint)style.ItemSpacing.x; fs << "ItemSpacingY" << (SLint)style.ItemSpacing.y; fs << "ScrollbarSize" << (SLfloat)style.ScrollbarSize; @@ -4719,7 +4770,7 @@ void AppDemoGui::saveConfig() SLFileStorage::writeString(fullPathAndFilename, IOK_config, configString); - SL_LOG("Config. saved : %s", fullPathAndFilename.c_str()); + SL_LOG("Config. saved : %s", fullPathAndFilename.c_str()); } //----------------------------------------------------------------------------- //! 
Adds a transform node for the selected node and toggles the edit mode @@ -4733,7 +4784,7 @@ void AppDemoGui::setTransformEditMode(SLScene* s, { tN = new SLTransformNode(sv, s->singleNodeSelected(), - AppDemo::shaderPath); + AppCommon::shaderPath); s->root3D()->addChild(tN); } @@ -4780,9 +4831,9 @@ void AppDemoGui::showHorizon(SLScene* s, SLSceneView* sv) if (!horizonNode) { horizonNode = new SLHorizonNode(horizonName, - &AppDemo::devRot, + &AppCommon::devRot, am->font16, - AppDemo::shaderPath, + AppCommon::shaderPath, sv->scrW(), sv->scrH()); s->root2D()->addChild(horizonNode); @@ -4846,15 +4897,15 @@ void AppDemoGui::loadSceneWithLargeModel(SLScene* s, SLSceneID sceneIDToLoad) { SLstring pathSrc = "https://pallas.ti.bfh.ch/data/SLProject/models/"; - SLstring pathDst = AppDemo::configPath + "models/"; + SLstring pathDst = AppCommon::configPath + "models/"; #ifndef SL_EMSCRIPTEN if (Utils::fileExists(filenameToLoad)) - s->onLoad(s->assetManager(), s, sv, sceneIDToLoad); + AppCommon::sceneToLoad = sceneIDToLoad; else downloadModelAndLoadScene(s, sv, downloadFilename, pathSrc, pathDst, filenameToLoad, sceneIDToLoad); #else - s->onLoad(s->assetManager(), s, sv, sceneIDToLoad); + AppCommon::sceneToLoad = sceneIDToLoad; #endif } //----------------------------------------------------------------------------- @@ -4876,7 +4927,7 @@ void AppDemoGui::downloadModelAndLoadScene(SLScene* s, if (filesize > 0) { int transferredPC = (int)((float)curr / (float)filesize * 100.0f); - AppDemo::jobProgressNum(transferredPC); + AppCommon::jobProgressNum(transferredPC); } else cout << "Bytes transferred: " << curr << endl; @@ -4888,22 +4939,22 @@ void AppDemoGui::downloadModelAndLoadScene(SLScene* s, { PROFILE_FUNCTION(); string jobMsg = "Downloading file via HTTPS: " + downloadFilename; - AppDemo::jobProgressMsg(jobMsg); - AppDemo::jobProgressMax(100); + AppCommon::jobProgressMsg(jobMsg); + AppCommon::jobProgressMax(100); string fileToDownload = urlFolder + downloadFilename; if 
(HttpUtils::download(fileToDownload, dstFolder, progressCallback) != 0) { SL_LOG("*** Nothing downloaded from: %s ***", fileToDownload.c_str()); SL_LOG("*** PLEASE RETRY DOWNLOAD ***", fileToDownload.c_str()); } - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; auto unzipJob = [=]() { string jobMsg = "Decompressing file: " + downloadFilename; - AppDemo::jobProgressMsg(jobMsg); - AppDemo::jobProgressMax(-1); + AppCommon::jobProgressMsg(jobMsg); + AppCommon::jobProgressMax(-1); string zipFile = dstFolder + downloadFilename; if (Utils::fileExists(zipFile)) { @@ -4917,21 +4968,21 @@ void AppDemoGui::downloadModelAndLoadScene(SLScene* s, else SL_LOG("*** File do decompress doesn't exist: %s ***", zipFile.c_str()); - AppDemo::jobIsRunning = false; + AppCommon::jobIsRunning = false; }; auto followUpJob1 = [=]() { if (Utils::fileExists(pathAndFileToLoad)) - s->onLoad(am, s, sv, sceneIDToLoad); + AppCommon::sceneToLoad = sceneIDToLoad; else SL_LOG("*** File do load doesn't exist: %s ***", pathAndFileToLoad.c_str()); }; - AppDemo::jobsToBeThreaded.emplace_back(downloadJobHTTP); - AppDemo::jobsToBeThreaded.emplace_back(unzipJob); - AppDemo::jobsToFollowInMain.push_back(followUpJob1); + AppCommon::jobsToBeThreaded.emplace_back(downloadJobHTTP); + AppCommon::jobsToBeThreaded.emplace_back(unzipJob); + AppCommon::jobsToFollowInMain.push_back(followUpJob1); #endif } //----------------------------------------------------------------------------- @@ -4940,10 +4991,10 @@ void AppDemoGui::setActiveNamedLocation(int locIndex, SLSceneView* sv, SLVec3f lookAtPoint) { - AppDemo::devLoc.activeNamedLocation(locIndex); + AppCommon::devLoc.activeNamedLocation(locIndex); #if !defined(SL_OS_MACIOS) && !defined(SL_OS_ANDROID) - SLVec3d pos_d = AppDemo::devLoc.defaultENU() - AppDemo::devLoc.originENU(); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); SLCamera* cam = 
sv->camera(); cam->translation(pos_f); diff --git a/apps/app_demo_slproject/include/AppDemoGui.h b/apps/app_demo/source/AppDemoGui.h similarity index 90% rename from apps/app_demo_slproject/include/AppDemoGui.h rename to apps/app_demo/source/AppDemoGui.h index 3d4f6cd9..7320dbf6 100644 --- a/apps/app_demo_slproject/include/AppDemoGui.h +++ b/apps/app_demo/source/AppDemoGui.h @@ -1,11 +1,11 @@ -//############################################################################# -// File: AppDemoGui.h -// Date: Summer 2017 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# +/** + * \file AppDemoGui.h + * \brief C++ Header file for the class AppDemoGui.h + * \date July 2014 + * \note https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 +*/ #ifndef SLGUIDEMO_H #define SLGUIDEMO_H @@ -26,7 +26,7 @@ class SLTexColorLUT; AppDemoGui::build. This build function is passed in the slCreateSceneView and it is called in SLSceneView::onPaint in every frame.
The entire UI is configured and built on every frame. That is why it is called -"Im" for immediate. See also the SLGLImGui class to see how it is minimal +"Im" for immediate. See also the SLImGui class to see how it is minimally integrated in the SLProject.
*/ class AppDemoGui @@ -75,6 +75,7 @@ class AppDemoGui static SLbool showTransform; //!< Flag if transform dialog should be shown static SLbool showDateAndTime; //!< Flag if date-time dialog should be shown static std::time_t adjustedTime; //!< Adjusted GUI time for sun setting (default 0) + static SLstring loadingString; //!< String shown during loading screens private: static void setTransformEditMode(SLScene* s, diff --git a/apps/app_demo/source/AppDemoMain.cpp b/apps/app_demo/source/AppDemoMain.cpp new file mode 100644 index 00000000..b79c3c96 --- /dev/null +++ b/apps/app_demo/source/AppDemoMain.cpp @@ -0,0 +1,308 @@ +/** + * \file AppDemoMain.cpp + * \brief This file has the main function of the demo app of SLProject + * \details An App::Config is set and then passed to the run function defined in + * App.h. Besides this, it contains the callback functions for the + * scene creation. See App.h for their typedef. + * For more info on how to create a new app with SLProject see: + * https://github.com/cpvrlab/SLProject4/wiki/Creating-a-New-App + * For more info about App framework see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \date July 2024 + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code.
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef SL_BUILD_WAI +# include +#else +# include +#endif + +//----------------------------------------------------------------------------- +// Global pointers and functions declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; +bool onUpdateVideo(); + +//----------------------------------------------------------------------------- +static SLSceneView* createSceneView(SLScene* scene, + int curDPI, + SLInputManager& inputManager) +{ + // The sceneview will be deleted by SLScene::~SLScene() + return new AppDemoSceneView(scene, curDPI, inputManager); +} +//----------------------------------------------------------------------------- +static SLScene* createScene(SLSceneID sceneID) +{ + switch (sceneID) + { + case SID_Empty: return new AppDemoSceneEmpty(); + case SID_Figure: return new AppDemoSceneFigure(); + case SID_Minimal: return new AppDemoSceneMinimal(); + case SID_MeshLoad: return new AppDemoSceneMeshLoad(); + case SID_Revolver: return new AppDemoSceneRevolver(); + case SID_TextureBlend: return new AppDemoSceneTextureBlend(); + case SID_TextureFilter: return 
new AppDemoSceneTextureFilter(); +#ifdef SL_BUILD_WITH_KTX + case SID_TextureCompression: return new AppDemoSceneTextureCompression(); +#endif + case SID_FrustumCull: return new AppDemoSceneFrustum(); + case SID_2Dand3DText: return new AppDemoScene2Dand3DText(); + case SID_PointClouds: return new AppDemoScenePointClouds(); + case SID_ZFighting: return new AppDemoSceneZFighting(); + case SID_ShaderPerVertexBlinn: return new AppDemoSceneShaderBlinn("Per Vertex Blinn-Phong Lighting", true); + case SID_ShaderPerPixelBlinn: return new AppDemoSceneShaderBlinn("Per Pixel Blinn-Phong Lighting", false); + case SID_ShaderPerPixelCook: return new AppDemoSceneShaderCook(); + case SID_ShaderIBL: return new AppDemoSceneShaderIBL(); + case SID_ShaderWave: return new AppDemoSceneShaderWave(); + case SID_ShaderBumpNormal: return new AppDemoSceneShaderBump(); + case SID_ShaderBumpParallax: return new AppDemoSceneShaderParallax(); + case SID_ShaderSkybox: return new AppDemoSceneShaderSkybox(); + case SID_ShaderEarth: return new AppDemoSceneShaderEarth(); + case SID_ShadowMappingBasicScene: return new AppDemoSceneShadowBasic(); + case SID_ShadowMappingLightTypes: return new AppDemoSceneShadowLightTypes(); + case SID_ShadowMappingSpotLights: return new AppDemoSceneShadowLightSpot(); + case SID_ShadowMappingPointLights: return new AppDemoSceneShadowLightPoint(); + case SID_ShadowMappingCascaded: return new AppDemoSceneShadowCascaded(); + case SID_SuzannePerPixBlinn: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and reflection colors", false, false, false, false, false); + case SID_SuzannePerPixBlinnTm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and texture mapping", true, false, false, false, false); + case SID_SuzannePerPixBlinnNm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and normal mapping", false, true, false, false, false); + case SID_SuzannePerPixBlinnAo: return new 
AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and ambient occlusion", false, false, true, false, false); + case SID_SuzannePerPixBlinnSm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and shadow mapping", false, false, false, true, false); + case SID_SuzannePerPixBlinnTmNm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting, texture and normal mapping", true, true, false, false, false); + case SID_SuzannePerPixBlinnTmAo: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting, texture mapping and ambient occlusion", true, false, true, false, false); + case SID_SuzannePerPixBlinnNmAo: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting, normal mapping and ambient occlusion", false, true, true, false, false); + case SID_SuzannePerPixBlinnTmSm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting, texture mapping and shadow mapping", true, false, false, true, false); + case SID_SuzannePerPixBlinnNmSm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting, normal mapping and shadow mapping", false, true, false, true, false); + case SID_SuzannePerPixBlinnAoSm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting, ambient occlusion and shadow mapping", false, false, true, true, false); + case SID_SuzannePerPixBlinnTmNmAo: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and diffuse, normal and ambient occlusion mapping", true, true, true, false, false); + case SID_SuzannePerPixBlinnTmNmSm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and diffuse, normal and shadow mapping ", true, true, false, true, false); + case SID_SuzannePerPixBlinnTmNmAoSm: return new AppDemoSceneSuzanne("Suzanne with per pixel Blinn-Phong lighting and diffuse, normal, ambient occlusion and shadow mapping", true, true, true, true, false); + case
SID_SuzannePerPixCookTmNmAoSmEm: return new AppDemoSceneSuzanne("Suzanne with per pixel Cook-Torrance lighting and diffuse, normal, ambient occlusion, shadow and environment mapping", true, true, true, true, true); + case SID_glTF_DamagedHelmet: + case SID_glTF_FlightHelmet: + case SID_glTF_Sponza: + case SID_glTF_WaterBottle: return new AppDemoSceneGLTF(sceneID); + case SID_Robotics_FanucCRX_FK: return new AppDemoSceneRobot(); + case SID_VolumeRayCast: return new AppDemoSceneVolumeRayCast(); + case SID_VolumeRayCastLighted: return new AppDemoSceneVolumeRayCastLighted(); + case SID_AnimationNode: return new AppDemoSceneAnimNode(); + case SID_AnimationNodeMass: return new AppDemoSceneAnimNodeMass(); + case SID_AnimationSkinned: return new AppDemoSceneAnimSkinned(); + case SID_AnimationSkinnedMass: return new AppDemoSceneAnimSkinnedMass(); + case SID_VideoTextureFile: + case SID_VideoTextureLive: return new AppDemoSceneVideoTexture(sceneID); + case SID_VideoTrackChessMain: + case SID_VideoTrackChessScnd: + case SID_VideoCalibrateMain: + case SID_VideoCalibrateScnd: return new AppDemoSceneVideoTrackChessboard(sceneID); + case SID_VideoTrackArucoMain: + case SID_VideoTrackArucoScnd: return new AppDemoSceneVideoTrackAruco(sceneID); + case SID_VideoTrackFaceMain: + case SID_VideoTrackFaceScnd: return new AppDemoSceneVideoTrackFace(sceneID); + case SID_VideoTrackFeature2DMain: return new AppDemoSceneVideoTrackFeatures(); + case SID_VideoTrackMediaPipeHandsMain: return new AppDemoSceneVideoTrackMediapipe(); + case SID_VideoTrackWAI: return new AppDemoSceneVideoTrackWAI(); + case SID_VideoSensorAR: return new AppDemoSceneVideoSensorAR(); + case SID_ParticleSystem_Simple: return new AppDemoSceneParticleSimple(); + case SID_ParticleSystem_DustStorm: return new AppDemoSceneParticleDustStorm(); + case SID_ParticleSystem_Fountain: return new AppDemoSceneParticleFountain(); + case SID_ParticleSystem_Sun: return new AppDemoSceneParticleSun(); + case SID_ParticleSystem_RingOfFire: 
return new AppDemoSceneParticleRingOfFire(); + case SID_ParticleSystem_ComplexFire: + case SID_Benchmark_ParticleSystemComplexFire: return new AppDemoSceneParticleComplexFire(sceneID); + case SID_ParticleSystem_Many: return new AppDemoSceneParticleMany(); + case SID_RTSpheres: + case SID_RTSoftShadows: return new AppDemoSceneRTSpheres(sceneID); + case SID_RTMuttenzerBox: return new AppDemoSceneRTMuttenzerBox(); + case SID_RTDoF: return new AppDemoSceneRTDoF(); + case SID_RTLens: return new AppDemoSceneRTLens(); + case SID_Benchmark_JansUniverse: return new AppDemoSceneJansUniverse(); + case SID_Benchmark_NodeAnimations: return new AppDemoSceneAnimNodeMass2(); + case SID_Benchmark_LargeModel: return new AppDemoSceneLargeModel(); + case SID_Benchmark_LotsOfNodes: return new AppDemoSceneLotsOfNodes(); + case SID_Benchmark_ColumnsLOD: + case SID_Benchmark_ColumnsNoLOD: return new AppDemoSceneLevelOfDetail(sceneID); + case SID_Benchmark_SkinnedAnimations: return new AppDemoSceneAnimSkinnedMass2(); + case SID_ErlebAR_BernChristoffel: return new AppDemoSceneErlebARBernChristoffel(); + case SID_ErlebAR_BielBFH: return new AppDemoSceneErlebARBielBFH(); + case SID_ErlebAR_AugustaRauricaTmpTht: return new AppDemoSceneErlebARAugustaTmpTht(); + case SID_ErlebAR_AventicumCigognier: return new AppDemoSceneErlebARAventicumCigognier(); + case SID_ErlebAR_AventicumTheatre: return new AppDemoSceneErlebARAventicumTheater(); + case SID_ErlebAR_AventicumAmphiteatre: return new AppDemoSceneErlebARAventicumAmphitheater(); + case SID_ErlebAR_SutzKirchrain18: return new AppDemoSceneErlebARSutz(); + default: SL_EXIT_MSG("appDemoSwitchScene: Unknown SceneID"); + } +} +//----------------------------------------------------------------------------- +static void onBeforeSceneDelete(SLSceneView* sv, SLScene* s) +{ + // Reset video and trackers + CVCapture::instance()->videoType(VT_NONE); // turn off any video + CVTracked::resetTimes(); // delete all gVideoTracker times + delete gVideoTracker; // 
delete the tracker deep + gVideoTracker = nullptr; + gVideoTexture = nullptr; // The video texture will be deleted by scene uninit + gVideoTrackedNode = nullptr; // The tracked node will be deleted by scene uninit + + // Clear gui stuff that depends on scene and sceneview + AppDemoGui::clear(); +} +//----------------------------------------------------------------------------- +static void onBeforeSceneLoad(SLSceneView* sv, SLScene* s) +{ + AppDemoGui::loadingString = "Loading ..."; +} +//----------------------------------------------------------------------------- +static void onBeforeSceneAssembly(SLSceneView* sv, SLScene* s) +{ + AppDemoGui::loadingString = "Assembling ..."; +} +//----------------------------------------------------------------------------- +static void onAfterSceneAssembly(SLSceneView* sv, SLScene* s) +{ + if (CVCapture::instance()->videoType() != VT_NONE) + { + if (sv->viewportSameAsVideo()) + { + // Pass a negative value to the start function, so that the + // viewport aspect ratio can be adapted later to the video aspect. + // This will be known after start. + CVCapture::instance()->start(-1.0f); + SLVec2i videoAspect; + videoAspect.x = CVCapture::instance()->captureSize.width; + videoAspect.y = CVCapture::instance()->captureSize.height; + sv->setViewportFromRatio(videoAspect, + sv->viewportAlign(), + true); + } + else + CVCapture::instance()->start(sv->viewportWdivH()); + } +} +//----------------------------------------------------------------------------- +static SLbool onUpdate(SLSceneView* sv) +{ + // If live video image is requested grab it and copy it + if (CVCapture::instance()->videoType() != VT_NONE) + { + float viewportWdivH = sv->viewportWdivH(); + CVCapture::instance()->grabAndAdjustForSL(viewportWdivH); + } + + SLbool trackingGotUpdated = onUpdateVideo(); + return trackingGotUpdated; +} +//----------------------------------------------------------------------------- +// The entry point of our application. 
+// On most platforms, SL_MAIN_FUNCTION is simply `main`. +// On Android though, there is no `main` function, so we emulate it by setting +// SL_MAIN_FUNCTION to `slAndroidMain`, which creates a "fake" main function. +// This `slAndroidMain` function is then called in the JNI initialization code +// to set up the application configuration. +int SL_MAIN_FUNCTION(int argc, char* argv[]) +{ + App::Config config; + config.argc = argc; + config.argv = argv; + config.windowWidth = 1280; + config.windowHeight = 720; + config.windowTitle = "SLProject Test Application"; + config.numSamples = 4; + config.startSceneID = SL_STARTSCENE; + config.onNewSceneView = createSceneView; + config.onNewScene = createScene; + config.onBeforeSceneDelete = onBeforeSceneDelete; + config.onBeforeSceneLoad = onBeforeSceneLoad; + config.onBeforeSceneAssembly = onBeforeSceneAssembly; + config.onAfterSceneAssembly = onAfterSceneAssembly; + config.onUpdate = onUpdate; + config.onGuiBuild = AppDemoGui::build; + config.onGuiLoadConfig = AppDemoGui::loadConfig; + config.onGuiSaveConfig = AppDemoGui::saveConfig; + + return App::run(config); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/AppDemoSceneID.h b/apps/app_demo/source/AppDemoSceneID.h new file mode 100644 index 00000000..92168979 --- /dev/null +++ b/apps/app_demo/source/AppDemoSceneID.h @@ -0,0 +1,133 @@ +/** + * \file AppDemoSceneID.h + * \brief Definition of scene IDs in the demo app + * \date July 2024 + * \note https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + */ + +#ifndef APPDEMOSCENEID_H +#define APPDEMOSCENEID_H + +#include + +//----------------------------------------------------------------------------- +enum AppDemoSceneID : SLSceneID +{ + SID_FromFile = -2, // Custom asset loaded over menu + SID_Empty = SL_EMPTY_SCENE_ID, // No data in scene 
+ SID_Minimal, + SID_Figure, + SID_MeshLoad, + SID_Revolver, + SID_TextureFilter, + SID_TextureBlend, + SID_TextureCompression, + SID_FrustumCull, + SID_2Dand3DText, + SID_PointClouds, + SID_ZFighting, + + SID_ShaderPerVertexBlinn, + SID_ShaderPerPixelBlinn, + SID_ShaderPerPixelCook, + SID_ShaderIBL, + SID_ShaderWave, + SID_ShaderBumpNormal, + SID_ShaderBumpParallax, + SID_ShaderSkybox, + SID_ShaderEarth, + SID_ShadowMappingBasicScene, + SID_ShadowMappingLightTypes, + SID_ShadowMappingPointLights, + SID_ShadowMappingSpotLights, + SID_ShadowMappingCascaded, + + SID_SuzannePerPixBlinn, + SID_SuzannePerPixBlinnTm, + SID_SuzannePerPixBlinnNm, + SID_SuzannePerPixBlinnAo, + SID_SuzannePerPixBlinnSm, + SID_SuzannePerPixBlinnTmNm, + SID_SuzannePerPixBlinnTmAo, + SID_SuzannePerPixBlinnNmAo, + SID_SuzannePerPixBlinnTmSm, + SID_SuzannePerPixBlinnNmSm, + SID_SuzannePerPixBlinnAoSm, + SID_SuzannePerPixBlinnTmNmAo, + SID_SuzannePerPixBlinnTmNmSm, + SID_SuzannePerPixBlinnTmNmAoSm, + SID_SuzannePerPixCookTmNmAoSmEm, + + SID_glTF_DamagedHelmet, + SID_glTF_FlightHelmet, + SID_glTF_Sponza, + SID_glTF_WaterBottle, + + SID_Robotics_FanucCRX_FK, + + SID_VolumeRayCast, + SID_VolumeRayCastLighted, + + SID_AnimationNode, + SID_AnimationNodeMass, + SID_AnimationSkinned, + SID_AnimationSkinnedMass, + + SID_VideoTextureLive, + SID_VideoTextureFile, + SID_VideoCalibrateMain, + SID_VideoCalibrateScnd, + SID_VideoTrackChessMain, + SID_VideoTrackChessScnd, + SID_VideoTrackArucoMain, + SID_VideoTrackArucoScnd, + SID_VideoTrackFeature2DMain, + SID_VideoTrackFaceMain, + SID_VideoTrackFaceScnd, + SID_VideoTrackMediaPipeHandsMain, + SID_VideoSensorAR, + SID_VideoTrackWAI, + + SID_RTMuttenzerBox, + SID_RTSpheres, + SID_RTSoftShadows, + SID_RTDoF, + SID_RTLens, + + SID_ParticleSystem_Simple, + SID_ParticleSystem_DustStorm, + SID_ParticleSystem_Fountain, + SID_ParticleSystem_Sun, + SID_ParticleSystem_RingOfFire, + SID_ParticleSystem_ComplexFire, + SID_ParticleSystem_Many, + + SID_MaxNoBenchmarks, + + 
SID_Benchmark_LargeModel, + SID_Benchmark_LotsOfNodes, + SID_Benchmark_NodeAnimations, + SID_Benchmark_SkinnedAnimations, + SID_Benchmark_ColumnsNoLOD, + SID_Benchmark_ColumnsLOD, + SID_Benchmark_JansUniverse, + SID_Benchmark_ParticleSystemComplexFire, + + SID_MaxPublicAssets, + + // These scenes are not part of the public data + SID_ErlebAR_BernChristoffel, + SID_ErlebAR_BielBFH, + SID_ErlebAR_AugustaRauricaTmpTht, + SID_ErlebAR_AventicumAmphiteatre, + SID_ErlebAR_AventicumCigognier, + SID_ErlebAR_AventicumTheatre, + SID_ErlebAR_SutzKirchrain18, + + SID_MaxAll +}; +//----------------------------------------------------------------------------- + +#endif \ No newline at end of file diff --git a/apps/app_demo/source/AppDemoSceneView.cpp b/apps/app_demo/source/AppDemoSceneView.cpp new file mode 100644 index 00000000..9c885442 --- /dev/null +++ b/apps/app_demo/source/AppDemoSceneView.cpp @@ -0,0 +1,97 @@ +/** + * \file AppDemoSceneView.cpp + * \date August 2019 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + * \authors Marcus Hudritsch, Michael Göttlicher + * \copyright http://opensource.org/licenses/GPL-3.0 + */ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneView::AppDemoSceneView(SLScene* s, + int dpi, + SLInputManager& inputManager) + : SLSceneView(s, dpi, inputManager) +{ +} +//----------------------------------------------------------------------------- +/*! This method overrides the same method from the base class SLSceneView. + It runs some app-specific code if a certain key is pressed and calls the + default implementation from SLSceneView if it doesn't consume the event. 
+*/ +SLbool AppDemoSceneView::onKeyPress(SLKey key, SLKey mod) +{ + // Keyboard shortcuts for next or previous sceneID loading + if (mod & K_alt && mod & K_shift) + { + SLSceneView* sv = AppCommon::sceneViews[0]; + if (key == '0' && sv) + { + AppCommon::sceneToLoad = SID_Empty; + return true; + } + else if (key == K_left && sv && + AppCommon::sceneID > 0 && + AppCommon::sceneID < SID_MaxNoBenchmarks) + { + AppCommon::sceneToLoad = static_cast<SLSceneID>(AppCommon::sceneID - 1); + return true; + } + else if (key == K_right && sv && + AppCommon::sceneID < SID_MaxNoBenchmarks - 1) + { + AppCommon::sceneToLoad = static_cast<SLSceneID>(AppCommon::sceneID + 1); + return true; + } + } + + return SLSceneView::onKeyPress(key, mod); +} +//----------------------------------------------------------------------------- +/*! This method overrides the same method from the base class SLSceneView. + It runs some app-specific code if a certain key is pressed and calls the + default implementation from SLSceneView if it doesn't consume the event. +*/ +SLbool AppDemoSceneView::onKeyRelease(SLKey key, SLKey mod) +{ + if (AppDemoGui::hideUI) + { + AppDemoGui::hideUI = false; + return true; + } + + return SLSceneView::onKeyRelease(key, mod); +} +//----------------------------------------------------------------------------- +/*! This method overrides the same method from the base class SLSceneView. + Most events such as all mouse and keyboard events from the OS are forwarded to + SLSceneview. SLSceneview implements a default behaviour. If you want a + different or additional behaviour for a certain eventhandler you have to sub- + class SLSceneView and override the eventhandler. + Because all video processing (capturing and calibration) is handled outside + of the core SLProject we need to add an additional handling for mouse down + within the calibration routine.
+ */ +SLbool AppDemoSceneView::onMouseDown(SLMouseButton button, + SLint x, + SLint y, + SLKey mod) +{ + // Call base class event-handler for default mouse and touchdown behaviour + bool baseClassResult = SLSceneView::onMouseDown(button, x, y, mod); + + // Grab image during calibration if calibration stream is running + if (AppCommon::sceneID == SID_VideoCalibrateMain || + AppCommon::sceneID == SID_VideoCalibrateScnd) + { + grab = true; + } + + return baseClassResult; +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo_slproject/include/AppDemoSceneView.h b/apps/app_demo/source/AppDemoSceneView.h similarity index 53% rename from apps/app_demo_slproject/include/AppDemoSceneView.h rename to apps/app_demo/source/AppDemoSceneView.h index d12d4d04..dee748e9 100644 --- a/apps/app_demo_slproject/include/AppDemoSceneView.h +++ b/apps/app_demo/source/AppDemoSceneView.h @@ -1,11 +1,11 @@ -//############################################################################# -// File: AppDemoSceneView.h -// Date: August 2019 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# +/** + * \file AppDemoSceneView.h + * \date August 2019 + * \authors Marcus Hudritsch + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ #include @@ -20,7 +20,9 @@ class AppDemoSceneView : public SLSceneView public: AppDemoSceneView(SLScene* s, int dpi, SLInputManager& inputManager); - // From SLSceneView overwritten + // Overwritten from SLSceneView + SLbool onKeyPress(SLKey key, SLKey mod) final; + SLbool onKeyRelease(SLKey key, SLKey mod) final; SLbool onMouseDown(SLMouseButton button, SLint x, SLint y, SLKey mod) final; SLbool grab = false; }; diff --git a/apps/app_demo_slproject/source/AppDemoVideo.cpp b/apps/app_demo/source/AppDemoVideo.cpp similarity index 58% rename from apps/app_demo_slproject/source/AppDemoVideo.cpp rename to apps/app_demo/source/AppDemoVideo.cpp index b38fe38c..a119e159 100644 --- a/apps/app_demo_slproject/source/AppDemoVideo.cpp +++ b/apps/app_demo/source/AppDemoVideo.cpp @@ -1,12 +1,16 @@ -//############################################################################# -// File: AppDemoVideo.cpp -// Date: August 2019 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# +/** + * \file AppDemoVideo.cpp + * \brief All video capturing and video tracking functions are in here. + * \date August 2019 + * \authors Marcus Hudritsch + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ +#include +#include +#include #include #include #include @@ -14,8 +18,6 @@ #include #include #include -#include -#include #include #include @@ -25,29 +27,33 @@ //----------------------------------------------------------------------------- /*! 
Global pointer for the video texture defined in AppDemoLoad for video scenes - It gets updated in the following onUpdateTracking routine */ -SLGLTexture* videoTexture = nullptr; + It gets updated in the following onUpdateTracking routine + */ +SLGLTexture* gVideoTexture = nullptr; -/*! Global pointer for a tracker that is set in AppDemoLoad for video scenes - It gets updated in the following onUpdateTracking routine */ -CVTracked* tracker = nullptr; +/*! Global pointer for a tracker that is set in AppDemoLoad for video scenes + It gets updated in the following onUpdateTracking routine + */ +CVTracked* gVideoTracker = nullptr; -/*! Global pointer to a node that from witch the tracker changes the pose. - it gets updated in the following onUpdateTracking routine */ -SLNode* trackedNode = nullptr; +/*! Global pointer to the node whose pose gets changed by the gVideoTracker. + It gets updated in the following onUpdateTracking routine + */ +SLNode* gVideoTrackedNode = nullptr; //----------------------------------------------------------------------------- -// always update scene camera fovV from calibration because the calibration may have -// been adapted in adjustForSL after a change of aspect ratio! -// Attention: The active scene view camera may be a different one that the tracking camera -// but we have to update the tracking camera only! +/*! Always update scene camera fovV from calibration because the calibration + may have been adapted in adjustForSL after a change of aspect ratio! + Attention: The active scene view camera may be a different one than the + tracking camera but we have to update the tracking camera only! 
+*/ void updateTrackingSceneCamera(CVCamera* ac) { PROFILE_FUNCTION(); - if (trackedNode && typeid(*trackedNode) == typeid(SLCamera)) + if (gVideoTrackedNode && typeid(*gVideoTrackedNode) == typeid(SLCamera)) { - SLCamera* trackingCam = dynamic_cast(trackedNode); + SLCamera* trackingCam = dynamic_cast(gVideoTrackedNode); trackingCam->fov(ac->calibration.cameraFovVDeg()); } } @@ -61,76 +67,76 @@ void runCalibrationEstimator(CVCamera* ac, SLScene* s, SLSceneView* sv) static bool processedCalibResult = false; try { - if (!AppDemo::calibrationEstimator) + if (!AppCommon::calibrationEstimator) { - AppDemo::calibrationEstimator = new CVCalibrationEstimator(AppDemo::calibrationEstimatorParams, - CVCapture::instance()->activeCamSizeIndex, - ac->mirrorH(), - ac->mirrorV(), - ac->type(), - Utils::ComputerInfos::get(), - AppDemo::calibIniPath, - AppDemo::externalPath, - AppDemo::exePath); + AppCommon::calibrationEstimator = new CVCalibrationEstimator(AppCommon::calibrationEstimatorParams, + CVCapture::instance()->activeCamSizeIndex, + ac->mirrorH(), + ac->mirrorV(), + ac->type(), + Utils::ComputerInfos::get(), + AppCommon::calibIniPath, + AppCommon::externalPath, + AppCommon::exePath); // clear grab request from sceneview adSv->grab = false; processedCalibResult = false; } - if (AppDemo::calibrationEstimator->isStreaming()) + if (AppCommon::calibrationEstimator->isStreaming()) { - AppDemo::calibrationEstimator->updateAndDecorate(CVCapture::instance()->lastFrame, - CVCapture::instance()->lastFrameGray, - adSv->grab); + AppCommon::calibrationEstimator->updateAndDecorate(CVCapture::instance()->lastFrame, + CVCapture::instance()->lastFrameGray, + adSv->grab); // reset grabbing switch adSv->grab = false; stringstream ss; ss << "Click on the screen to create a calibration photo. 
Created " - << AppDemo::calibrationEstimator->numCapturedImgs() - << " of " << AppDemo::calibrationEstimator->numImgsToCapture(); + << AppCommon::calibrationEstimator->numCapturedImgs() + << " of " << AppCommon::calibrationEstimator->numImgsToCapture(); s->info(ss.str()); } - else if (AppDemo::calibrationEstimator->isBusyExtracting()) + else if (AppCommon::calibrationEstimator->isBusyExtracting()) { // also reset grabbing, user has to click again adSv->grab = false; - AppDemo::calibrationEstimator->updateAndDecorate(CVCapture::instance()->lastFrame, - CVCapture::instance()->lastFrameGray, - false); + AppCommon::calibrationEstimator->updateAndDecorate(CVCapture::instance()->lastFrame, + CVCapture::instance()->lastFrameGray, + false); s->info("Busy extracting corners, please wait with grabbing ..."); } - else if (AppDemo::calibrationEstimator->isCalculating()) + else if (AppCommon::calibrationEstimator->isCalculating()) { - AppDemo::calibrationEstimator->updateAndDecorate(CVCapture::instance()->lastFrame, - CVCapture::instance()->lastFrameGray, - false); + AppCommon::calibrationEstimator->updateAndDecorate(CVCapture::instance()->lastFrame, + CVCapture::instance()->lastFrameGray, + false); s->info("Calculating calibration, please wait ..."); } - else if (AppDemo::calibrationEstimator->isDone()) + else if (AppCommon::calibrationEstimator->isDone()) { if (!processedCalibResult) { - if (AppDemo::calibrationEstimator->calibrationSuccessful()) + if (AppCommon::calibrationEstimator->calibrationSuccessful()) { processedCalibResult = true; - ac->calibration = AppDemo::calibrationEstimator->getCalibration(); + ac->calibration = AppCommon::calibrationEstimator->getCalibration(); std::string computerInfo = Utils::ComputerInfos::get(); string mainCalibFilename = "camCalib_" + computerInfo + "_main.xml"; string scndCalibFilename = "camCalib_" + computerInfo + "_scnd.xml"; std::string errorMsg; - if (ac->calibration.save(AppDemo::calibFilePath, mainCalibFilename)) + if 
(ac->calibration.save(AppCommon::calibFilePath, mainCalibFilename)) { #ifndef SL_EMSCRIPTEN - if (!FtpUtils::uploadFile(AppDemo::calibFilePath, + if (!FtpUtils::uploadFile(AppCommon::calibFilePath, mainCalibFilename, - AppDemo::CALIB_FTP_HOST, - AppDemo::CALIB_FTP_USER, - AppDemo::CALIB_FTP_PWD, - AppDemo::CALIB_FTP_DIR, + AppCommon::CALIB_FTP_HOST, + AppCommon::CALIB_FTP_USER, + AppCommon::CALIB_FTP_PWD, + AppCommon::CALIB_FTP_DIR, errorMsg)) { Utils::log("WAIApp", errorMsg.c_str()); @@ -150,7 +156,7 @@ void runCalibrationEstimator(CVCamera* ac, SLScene* s, SLSceneView* sv) } } } - else if (AppDemo::calibrationEstimator->isDoneCaptureAndSave()) + else if (AppCommon::calibrationEstimator->isDoneCaptureAndSave()) { s->info(("Capturing done!")); } @@ -169,18 +175,18 @@ void ensureValidCalibration(CVCamera* ac, SLSceneView* sv) PROFILE_FUNCTION(); // we have to make sure calibration process is stopped if someone stops calibrating - if (AppDemo::calibrationEstimator) + if (AppCommon::calibrationEstimator) { - delete AppDemo::calibrationEstimator; - AppDemo::calibrationEstimator = nullptr; + delete AppCommon::calibrationEstimator; + AppCommon::calibrationEstimator = nullptr; } if (ac->calibration.state() == CS_uncalibrated) { // Try to read device lens and sensor information - string strF = AppDemo::deviceParameter["DeviceLensFocalLength"]; - string strW = AppDemo::deviceParameter["DeviceSensorPhysicalSizeW"]; - string strH = AppDemo::deviceParameter["DeviceSensorPhysicalSizeH"]; + string strF = AppCommon::deviceParameter["DeviceLensFocalLength"]; + string strW = AppCommon::deviceParameter["DeviceSensorPhysicalSizeW"]; + string strH = AppCommon::deviceParameter["DeviceSensorPhysicalSizeH"]; if (!strF.empty() && !strW.empty() && !strH.empty()) { float devF = strF.empty() ? 
0.0f : stof(strF); @@ -221,11 +227,11 @@ bool onUpdateVideo() { PROFILE_FUNCTION(); - if (AppDemo::sceneViews.empty()) + if (AppCommon::sceneViews.empty()) return false; - SLScene* s = AppDemo::scene; - SLSceneView* sv = AppDemo::sceneViews[0]; + SLScene* s = AppCommon::scene; + SLSceneView* sv = AppCommon::sceneViews[0]; if (CVCapture::instance()->videoType() != VT_NONE && !CVCapture::instance()->lastFrame.empty()) @@ -234,8 +240,8 @@ bool onUpdateVideo() CVCamera* ac = CVCapture::instance()->activeCamera; - if (AppDemo::sceneID == SID_VideoCalibrateMain || - AppDemo::sceneID == SID_VideoCalibrateScnd) + if (AppCommon::sceneID == SID_VideoCalibrateMain || + AppCommon::sceneID == SID_VideoCalibrateScnd) { runCalibrationEstimator(ac, s, sv); } @@ -248,16 +254,16 @@ bool onUpdateVideo() // but we have to update the tracking camera only! updateTrackingSceneCamera(ac); - if (tracker && trackedNode) + if (gVideoTracker && gVideoTrackedNode) { - bool foundPose = tracker->track(CVCapture::instance()->lastFrameGray, - CVCapture::instance()->lastFrame, - &ac->calibration); + bool foundPose = gVideoTracker->track(CVCapture::instance()->lastFrameGray, + CVCapture::instance()->lastFrame, + &ac->calibration); if (foundPose) { // clang-format off // convert matrix type CVMatx44f to SLMat4f - CVMatx44f cvOVM = tracker->objectViewMat(); + CVMatx44f cvOVM = gVideoTracker->objectViewMat(); SLMat4f glOVM(cvOVM.val[0], cvOVM.val[1], cvOVM.val[2], cvOVM.val[3], cvOVM.val[4], cvOVM.val[5], cvOVM.val[6], cvOVM.val[7], cvOVM.val[8], cvOVM.val[9], cvOVM.val[10],cvOVM.val[11], @@ -266,27 +272,27 @@ bool onUpdateVideo() // set the object matrix depending if the // tracked node is attached to a camera or not - if (typeid(*trackedNode) == typeid(SLCamera)) + if (typeid(*gVideoTrackedNode) == typeid(SLCamera)) { - trackedNode->om(glOVM.inverted()); - trackedNode->setDrawBitsRec(SL_DB_HIDDEN, true); + gVideoTrackedNode->om(glOVM.inverted()); + gVideoTrackedNode->setDrawBitsRec(SL_DB_HIDDEN, 
true); } else { // see comments in CVTracked::calcObjectMatrix - trackedNode->om(sv->camera()->om() * glOVM); - trackedNode->setDrawBitsRec(SL_DB_HIDDEN, false); + gVideoTrackedNode->om(sv->camera()->om() * glOVM); + gVideoTrackedNode->setDrawBitsRec(SL_DB_HIDDEN, false); } } else - trackedNode->setDrawBitsRec(SL_DB_HIDDEN, false); + gVideoTrackedNode->setDrawBitsRec(SL_DB_HIDDEN, false); } // Update info text only for chessboard scene - if (AppDemo::sceneID == SID_VideoCalibrateMain || - AppDemo::sceneID == SID_VideoCalibrateScnd || - AppDemo::sceneID == SID_VideoTrackChessMain || - AppDemo::sceneID == SID_VideoTrackChessScnd) + if (AppCommon::sceneID == SID_VideoCalibrateMain || + AppCommon::sceneID == SID_VideoCalibrateScnd || + AppCommon::sceneID == SID_VideoTrackChessMain || + AppCommon::sceneID == SID_VideoTrackChessScnd) { SLfloat fovH = ac->calibration.cameraFovHDeg(); SLfloat err = ac->calibration.reprojectionError(); @@ -302,30 +308,30 @@ bool onUpdateVideo() //................................................................... 
// copy image to video texture - if (videoTexture) + if (gVideoTexture) { if (ac->calibration.state() == CS_calibrated && ac->showUndistorted()) { CVMat undistorted; ac->calibration.remap(CVCapture::instance()->lastFrame, undistorted); - // CVCapture::instance()->videoTexture()->copyVideoImage(undistorted.cols, - videoTexture->copyVideoImage(undistorted.cols, - undistorted.rows, - CVCapture::instance()->format, - undistorted.data, - undistorted.isContinuous(), - true); + // CVCapture::instance()->gVideoTexture()->copyVideoImage(undistorted.cols, + gVideoTexture->copyVideoImage(undistorted.cols, + undistorted.rows, + CVCapture::instance()->format, + undistorted.data, + undistorted.isContinuous(), + true); } else { - // CVCapture::instance()->videoTexture()->copyVideoImage(CVCapture::instance()->lastFrame.cols, - videoTexture->copyVideoImage(CVCapture::instance()->lastFrame.cols, - CVCapture::instance()->lastFrame.rows, - CVCapture::instance()->format, - CVCapture::instance()->lastFrame.data, - CVCapture::instance()->lastFrame.isContinuous(), - true); + // CVCapture::instance()->gVideoTexture()->copyVideoImage(CVCapture::instance()->lastFrame.cols, + gVideoTexture->copyVideoImage(CVCapture::instance()->lastFrame.cols, + CVCapture::instance()->lastFrame.rows, + CVCapture::instance()->format, + CVCapture::instance()->lastFrame.data, + CVCapture::instance()->lastFrame.isContinuous(), + true); } } else diff --git a/apps/app_demo/source/scenes/AppDemoScene2Dand3DText.cpp b/apps/app_demo/source/scenes/AppDemoScene2Dand3DText.cpp new file mode 100644 index 00000000..ae064d12 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoScene2Dand3DText.cpp @@ -0,0 +1,119 @@ +/** + * \file AppDemoScene2Dand3DText.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * 
\copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoScene2Dand3DText::AppDemoScene2Dand3DText() : SLScene("2D and 3D Text Scene") +{ + info("All 3D objects are in the _root3D scene and the center text is in the _root2D scene " + "and rendered in orthographic projection in screen space."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here (none for this scene). +void AppDemoScene2Dand3DText::registerAssetsToLoad(SLAssetLoader& al) +{ +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoScene2Dand3DText::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLMaterial* m1 = new SLMaterial(am, "m1", SLCol4f::RED); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.1f); + cam1->clipFar(100); + cam1->translation(0, 0, 5); + cam1->lookAt(0, 0, 0); + cam1->focalDist(5); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 10, + 10, + 10, + 0.3f); + light1->powers(0.2f, 0.8f, 1.0f); + light1->attenuation(1, 0, 0); + + // Because all text objects get their sizes in pixels we have to scale them down + SLfloat scale = 0.01f; + SLstring txt = "This is text in 3D with font07"; + SLVec2f size = SLAssetManager::font07->calcTextSize(txt); + SLNode* t07 = new SLText(txt, SLAssetManager::font07); + t07->translate(-size.x * 0.5f * scale, 1.0f, 0); + t07->scale(scale); + + txt = "This is text in 3D with font09"; + size = 
SLAssetManager::font09->calcTextSize(txt); + SLNode* t09 = new SLText(txt, SLAssetManager::font09); + t09->translate(-size.x * 0.5f * scale, 0.8f, 0); + t09->scale(scale); + + txt = "This is text in 3D with font12"; + size = SLAssetManager::font12->calcTextSize(txt); + SLNode* t12 = new SLText(txt, SLAssetManager::font12); + t12->translate(-size.x * 0.5f * scale, 0.6f, 0); + t12->scale(scale); + + txt = "This is text in 3D with font20"; + size = SLAssetManager::font20->calcTextSize(txt); + SLNode* t20 = new SLText(txt, SLAssetManager::font20); + t20->translate(-size.x * 0.5f * scale, -0.8f, 0); + t20->scale(scale); + + txt = "This is text in 3D with font22"; + size = SLAssetManager::font22->calcTextSize(txt); + SLNode* t22 = new SLText(txt, SLAssetManager::font22); + t22->translate(-size.x * 0.5f * scale, -1.2f, 0); + t22->scale(scale); + + // Now create 2D text but don't scale it (all sizes in pixels) + txt = "This is text in 2D with font16"; + size = SLAssetManager::font16->calcTextSize(txt); + SLNode* t2D16 = new SLText(txt, SLAssetManager::font16); + t2D16->translate(-size.x * 0.5f, 0, 0); + + // Assemble 3D scene as usual with camera and light + SLNode* scene3D = new SLNode("root3D"); + this->root3D(scene3D); + scene3D->addChild(cam1); + scene3D->addChild(light1); + scene3D->addChild(new SLNode(new SLSphere(am, + 0.5f, + 32, + 32, + "Sphere", + m1))); + scene3D->addChild(t07); + scene3D->addChild(t09); + scene3D->addChild(t12); + scene3D->addChild(t20); + scene3D->addChild(t22); + + // Assemble 2D scene + SLNode* scene2D = new SLNode("root2D"); + this->root2D(scene2D); + scene2D->addChild(t2D16); + + sv->camera(cam1); + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoScene2Dand3DText.h b/apps/app_demo/source/scenes/AppDemoScene2Dand3DText.h new file mode 100644 index 00000000..7e64fb1d --- /dev/null +++ 
b/apps/app_demo/source/scenes/AppDemoScene2Dand3DText.h @@ -0,0 +1,46 @@ +/** + * \file AppDemoScene2Dand3DText.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENE2DAND3DTEXT_H +#define APPDEMOSCENE2DAND3DTEXT_H + +#include + +//----------------------------------------------------------------------------- +//! Class for 2D and 3D text scene +class AppDemoScene2Dand3DText : public SLScene +{ +public: + AppDemoScene2Dand3DText(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimNode.cpp b/apps/app_demo/source/scenes/AppDemoSceneAnimNode.cpp new file mode 100644 index 00000000..93bff3d6 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneAnimNode.cpp @@ -0,0 +1,137 @@ +/** + * \file AppDemoSceneAnimNode.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneAnimNode::AppDemoSceneAnimNode() + : SLScene("Node Animation Test Scene") +{ + info("Node animations with different easing curves."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneAnimNode::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_tex1, + AppCommon::texturePath + "Checkerboard0512_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneAnimNode::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLMaterial* m1 = new SLMaterial(am, "m1", _tex1); + m1->kr(0.5f); + SLMaterial* m2 = new SLMaterial(am, "m2", SLCol4f::WHITE * 0.5, SLCol4f::WHITE, 128, 0.5f, 0.0f, 1.0f); + + SLMesh* floorMesh = new SLRectangle(am, SLVec2f(-5, -5), SLVec2f(5, 5), 20, 20, "FloorMesh", m1); + SLNode* floorRect = new SLNode(floorMesh); + floorRect->rotate(90, -1, 0, 0); + floorRect->translate(0, 0, -5.5f); + + // Bouncing balls + SLNode* ball1 = new SLNode(new SLSphere(am, 0.3f, 16, 16, "Ball1", m2)); + ball1->translate(0, 0, 4, TS_object); + SLAnimation* ball1Anim = animManager().createNodeAnimation("Ball1_anim", 1.0f, true, EC_linear, AL_pingPongLoop); + ball1Anim->createNodeAnimTrackForTranslation(ball1, SLVec3f(0.0f, -5.2f, 0.0f)); + + SLNode* ball2 = new SLNode(new SLSphere(am, 0.3f, 16, 16, "Ball2", m2)); + ball2->translate(-1.5f, 0, 4, TS_object); + SLAnimation* ball2Anim = animManager().createNodeAnimation("Ball2_anim", 1.0f, true, EC_inQuad, AL_pingPongLoop); + ball2Anim->createNodeAnimTrackForTranslation(ball2, SLVec3f(0.0f, -5.2f, 0.0f)); + + SLNode* ball3 = new SLNode(new SLSphere(am, 0.3f, 16, 16, "Ball3", m2)); + ball3->translate(-2.5f, 0, 4, TS_object); + SLAnimation* ball3Anim = animManager().createNodeAnimation("Ball3_anim", 1.0f, true, EC_outQuad, AL_pingPongLoop); + ball3Anim->createNodeAnimTrackForTranslation(ball3, SLVec3f(0.0f, -5.2f, 0.0f)); + + SLNode* ball4 = new SLNode(new SLSphere(am, 0.3f, 16, 16, "Ball4", m2)); + ball4->translate(1.5f, 0, 4, TS_object); + SLAnimation* ball4Anim = animManager().createNodeAnimation("Ball4_anim", 1.0f, true, EC_inOutQuad, AL_pingPongLoop); + ball4Anim->createNodeAnimTrackForTranslation(ball4, SLVec3f(0.0f, -5.2f, 0.0f)); + + SLNode* ball5 = new SLNode(new SLSphere(am, 0.3f, 16, 16, "Ball5", m2)); + ball5->translate(2.5f, 0, 4, TS_object); + SLAnimation* ball5Anim = animManager().createNodeAnimation("Ball5_anim", 1.0f, true, EC_outInQuad, 
AL_pingPongLoop); + ball5Anim->createNodeAnimTrackForTranslation(ball5, SLVec3f(0.0f, -5.2f, 0.0f)); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 22); + cam1->lookAt(0, 0, 0); + cam1->focalDist(22); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLCamera* cam2 = new SLCamera("Camera 2"); + cam2->translation(5, 0, 0); + cam2->lookAt(0, 0, 0); + cam2->focalDist(5); + cam2->clipFar(10); + cam2->background().colors(SLCol4f(0, 0, 0.6f), SLCol4f(0, 0, 0.3f)); + cam2->setInitialState(); + cam2->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLCamera* cam3 = new SLCamera("Camera 3"); + cam3->translation(-5, -2, 0); + cam3->lookAt(0, 0, 0); + cam3->focalDist(5); + cam3->clipFar(10); + cam3->background().colors(SLCol4f(0.6f, 0, 0), SLCol4f(0.3f, 0, 0)); + cam3->setInitialState(); + cam3->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLLightSpot* light1 = new SLLightSpot(am, this, 0, 2, 0, 0.5f); + light1->powers(0.2f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + SLAnimation* light1Anim = this->animManager().createNodeAnimation("Light1_anim", 4.0f); + light1Anim->createNodeAnimTrackForEllipse(light1, 6, A_z, 6, A_x); + + SLLightSpot* light2 = new SLLightSpot(am, this, 0, 0, 0, 0.2f); + light2->powers(0.1f, 1.0f, 1.0f); + light2->attenuation(1, 0, 0); + light2->translate(-8, -4, 0, TS_world); + light2->setInitialState(); + SLAnimation* light2Anim = animManager().createNodeAnimation("light2_anim", 2.0f, true, EC_linear, AL_pingPongLoop); + SLNodeAnimTrack* track = light2Anim->createNodeAnimTrack(); + track->animatedNode(light2); + track->createNodeKeyframe(0.0f); + track->createNodeKeyframe(1.0f)->translation(SLVec3f(8, 8, 0)); + track->createNodeKeyframe(2.0f)->translation(SLVec3f(16, 0, 0)); + track->translationInterpolation(AI_bezier); + + SLNode* figure = AppDemoSceneFigure::BuildFigureGroup(am, this, m2, true); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + 
scene->addChild(light1); + scene->addChild(light2); + scene->addChild(cam1); + scene->addChild(cam2); + scene->addChild(cam3); + scene->addChild(floorRect); + scene->addChild(ball1); + scene->addChild(ball2); + scene->addChild(ball3); + scene->addChild(ball4); + scene->addChild(ball5); + scene->addChild(figure); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimNode.h b/apps/app_demo/source/scenes/AppDemoSceneAnimNode.h new file mode 100644 index 00000000..806ee395 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneAnimNode.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneAnimNode.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEANIMNODE_H +#define APPDEMOSCENEANIMNODE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for node animation test scene +class AppDemoSceneAnimNode : public SLScene +{ +public: + AppDemoSceneAnimNode(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. 
+ /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _tex1; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass.cpp b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass.cpp new file mode 100644 index 00000000..1952bd18 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass.cpp @@ -0,0 +1,127 @@ +/** + * \file AppDemoSceneAnimNodeMass.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneAnimNodeMass::AppDemoSceneAnimNodeMass() + : SLScene("Mass Animation Test Scene") +{ + info("Performance test for transform updates from many animations."); +} +//----------------------------------------------------------------------------- +//! 
All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneAnimNodeMass::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Intentionally empty: assemble() only creates procedural geometry
+    // (one SLBox with a plain SLMaterial), so there is nothing to load.
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Builds a camera, a spot light and a hierarchy of boxes that all share one
+ * SLBox mesh. Every node of one level becomes the parent of a
+ * gridSize x gridSize grid of child nodes on the next level, and every node
+ * gets its own translation animation along the y-axis.
+ * \param am Asset manager passed to the light, material and mesh constructors
+ * \param sv Scene view that gets the created camera assigned
+ */
+void AppDemoSceneAnimNodeMass::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    // Create a scene group node
+    SLNode* scene = new SLNode("scene node");
+    root3D(scene);
+
+    // Create and add camera
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->translation(0, 20, 40);
+    cam1->lookAt(0, 0, 0);
+    cam1->focalDist(42);
+    scene->addChild(cam1);
+    sv->camera(cam1);
+
+    // Add spotlight
+    SLLightSpot* light1 = new SLLightSpot(am, this, 0.1f);
+    light1->translate(0, 10, 0);
+    scene->addChild(light1);
+
+    // build a basic scene to have a reference for the occurring rotations
+    SLMaterial* genericMat = new SLMaterial(am, "some material");
+
+    // we use the same mesh to visualize all the nodes
+    SLBox* box = new SLBox(am,
+                           -0.5f,
+                           -0.5f,
+                           -0.5f,
+                           0.5f,
+                           0.5f,
+                           0.5f,
+                           "box",
+                           genericMat);
+
+    // We build a stack of levels, each level has a grid of boxes on it
+    // each box on this grid has another grid above it with child nodes.
+    // Best results are achieved if gridSize is an uneven number.
+    // Level k (1-based) holds (gridSize^2)^k nodes, so the deepest level alone
+    // holds (gridSize^2)^levels nodes (9 + 81 + 729 for 3 x 3). Handle with care.
+    const SLint levels      = 3;
+    const SLint gridSize    = 3;
+    const SLint gridHalf    = gridSize / 2;
+    const SLint nodesPerLvl = gridSize * gridSize;
+
+    // node spacing per level
+    // nodes are 1^3 in size, we want to space the levels so that the densest levels meet
+    // (so exactly 1 unit spacing between blocks)
+    SLfloat nodeSpacing[levels];
+    for (SLint i = 0; i < levels; ++i)
+        nodeSpacing[(levels - 1) - i] = (SLfloat)pow((SLfloat)gridSize, (SLfloat)i);
+
+    // lists to keep track of previous grid level to set parents correctly
+    vector<SLNode*> parents;
+    vector<SLNode*> curParentsVector;
+
+    // first parent is the scene root
+    parents.push_back(scene);
+
+    SLint nodeIndex = 0;
+    for (float lvl : nodeSpacing)
+    {
+        // The nodes created on the previous level become the parents of this
+        // level; swapping avoids copying the whole list on every level.
+        curParentsVector.swap(parents);
+        parents.clear();
+
+        // for each parent in the previous level, add a completely new grid
+        for (auto parent : curParentsVector)
+        {
+            for (SLint i = 0; i < nodesPerLvl; ++i)
+            {
+                SLNode* node = new SLNode("MassAnimNode");
+                node->addMesh(box);
+                parent->addChild(node);
+                parents.push_back(node);
+
+                // position: column and row of cell i, centered around the parent
+                // (i / gridSize is already 0 for i == 0, no special case needed)
+                SLfloat x = (SLfloat)(i % gridSize - gridHalf);
+                SLfloat z = (SLfloat)(i / gridSize - gridHalf);
+                SLVec3f pos(x * lvl * 1.1f, 1.5f, z * lvl * 1.1f);
+
+                node->translate(pos, TS_object);
+                // node->scale(1.1f);
+
+                // durations are spread over [1, 6) seconds within one grid
+                SLfloat       duration = 1.0f + 5.0f * ((SLfloat)i / (SLfloat)nodesPerLvl);
+                ostringstream oss;
+
+                oss << "random anim " << nodeIndex++;
+                SLAnimation* anim = animManager().createNodeAnimation(oss.str(),
+                                                                      duration,
+                                                                      true,
+                                                                      EC_inOutSine,
+                                                                      AL_pingPongLoop);
+                anim->createNodeAnimTrackForTranslation(node,
+                                                        SLVec3f(0.0f, 1.0f, 0.0f));
+            }
+        }
+    }
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass.h b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass.h
new file mode 100644
index 00000000..9b5376a7
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass.h
@@ -0,0 +1,48 @@
+/**
+ * \file AppDemoSceneAnimNodeMass.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEANIMMASS_H
+#define APPDEMOSCENEANIMMASS_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for mass animation test scene
+class AppDemoSceneAnimNodeMass : public SLScene
+{
+public:
+    AppDemoSceneAnimNodeMass();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass2.cpp b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass2.cpp
new file mode 100644
index 00000000..04a77645
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass2.cpp
@@ -0,0 +1,183 @@
+/**
+ * \file AppDemoSceneAnimNodeMass2.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. 
See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+AppDemoSceneAnimNodeMass2::AppDemoSceneAnimNodeMass2()
+  : SLScene("Benchmark Node Animation Test Scene")
+{
+    info("Performance test for transform updates from many animations.");
+}
+//-----------------------------------------------------------------------------
+//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneAnimNodeMass2::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Nothing is registered here; the textures of this scene are created
+    // directly in assemble().
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Creates a camera, a spot light, NUM_MAT differently colored textured
+ * materials and one recursive group of rotating spheres (see RotatingSpheres).
+ * \param am Asset manager passed to all created assets
+ * \param sv Scene view that gets the created camera assigned
+ */
+void AppDemoSceneAnimNodeMass2::assemble(SLAssetManager* am,
+                                         SLSceneView*    sv)
+{
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->clipNear(0.1f);
+    cam1->clipFar(100);
+    cam1->translation(0, 2.5f, 20);
+    cam1->focalDist(20);
+    cam1->lookAt(0, 2.5f, 0);
+    cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f));
+    cam1->setInitialState();
+
+    SLLightSpot* light1 = new SLLightSpot(am,
+                                          this,
+                                          15,
+                                          15,
+                                          15,
+                                          0.3f);
+    light1->powers(0.2f, 0.8f, 1.0f);
+    light1->attenuation(1, 0, 0);
+
+    SLNode* scene = new SLNode;
+    root3D(scene);
+    scene->addChild(cam1);
+    scene->addChild(light1);
+
+    // Generate NUM_MAT materials, each with its own hue around the color wheel
+    // NOTE(review): every material creates its own SLGLTexture from the same
+    // image file; confirm whether this is intended for the benchmark or
+    // whether a single shared texture would do.
+    const int   NUM_MAT = 20;
+    SLVMaterial mat;
+    for (int i = 0; i < NUM_MAT; ++i)
+    {
+        SLGLTexture* texC = new SLGLTexture(am,
+                                            AppCommon::texturePath + "earth2048_C_Q95.jpg"); // color map
+
+        SLstring matName = "mat-" + std::to_string(i);
+        mat.push_back(new SLMaterial(am,
+                                     matName.c_str(),
+                                     texC));
+        SLCol4f color;
+        color.hsva2rgba(SLVec4f(Utils::TWOPI * (float)i / (float)NUM_MAT,
+                                1.0f,
+                                1.0f));
+        mat[i]->diffuse(color);
+    }
+
+    // create rotating sphere group
+    SLint maxDepth = 5;
+
+    // unsigned to match the resolution parameter of RotatingSpheres
+    SLuint resolution = 18;
+    scene->addChild(RotatingSpheres(am,
+                                    this,
+                                    maxDepth,
+                                    0,
+                                    0,
+                                    0,
+                                    1,
+                                    resolution,
+                                    mat));
+
+    sv->camera(cam1);
+    sv->doWaitOnIdle(false);
+}
+//-----------------------------------------------------------------------------
+//! Creates a recursive rotating sphere group used for performance test
+/*!
+ * This performance benchmark is expensive in terms of world matrix updates
+ * because all sphere groups rotate. Therefore all children need to update
+ * their wm every frame.
+ * Every call creates one sphere node with its own 360 degree rotation
+ * animation around the y-axis. While depth > 0 the node additionally gets
+ * nine children created with depth - 1 and a third of the scale.
+ * \param am Pointer to the asset manager
+ * \param s Pointer to the scene whose animation manager creates the animations
+ * \param depth Remaining recursion depth (>= 0); 0 creates a leaf sphere
+ * \param x Position in x direction
+ * \param y Position in y direction
+ * \param z Position in z direction
+ * \param scale Scale factor in the range [0, 1]; the sphere radius is 5 * scale
+ * \param resolution No. of stacks and slices of the spheres (1 to 63)
+ * \param mat Reference to a non-empty vector of materials to pick from
+ * \return Group node of spheres
+ */
+SLNode* AppDemoSceneAnimNodeMass2::RotatingSpheres(SLAssetManager* am,
+                                                   SLScene*        s,
+                                                   SLint           depth,
+                                                   SLfloat         x,
+                                                   SLfloat         y,
+                                                   SLfloat         z,
+                                                   SLfloat         scale,
+                                                   SLuint          resolution,
+                                                   SLVMaterial&    mat)
+{
+    assert(depth >= 0);
+    assert(scale >= 0.0f && scale <= 1.0f);
+    assert(resolution > 0 && resolution < 64);
+    assert(!mat.empty()); // a material is indexed below
+
+    // Choose the material index randomly
+    SLint iMat = Utils::random(0, (int)mat.size() - 1);
+
+    // Generate unique names for meshes, nodes and animations
+    // (the counter is static, so numbering continues across calls)
+    static int sphereNum = 0;
+    string     meshName  = "Mesh" + std::to_string(sphereNum);
+    string     animName  = "Anim" + std::to_string(sphereNum);
+    string     nodeName  = "Node" + std::to_string(sphereNum);
+    sphereNum++;
+
+    SLAnimation* nodeAnim = s->animManager().createNodeAnimation(animName,
+                                                                 60,
+                                                                 true,
+                                                                 EC_linear,
+                                                                 AL_loop);
+
+    const bool isLeaf = (depth == 0);
+
+    // decrease resolution of group spheres to reduce memory consumption
+    // (leaf spheres keep the resolution they were called with)
+    if (!isLeaf && resolution > 8)
+        resolution -= 2;
+
+    SLNode* sphNode = new SLNode(new SLSphere(am,
+                                              5.0f * scale,
+                                              resolution,
+                                              resolution,
+                                              meshName,
+                                              mat[iMat]),
+                                 nodeName);
+    sphNode->translate(x, y, z, TS_object);
+    nodeAnim->createNodeAnimTrackForRotation360(sphNode, SLVec3f(0, 1, 0));
+
+    if (isLeaf)
+        return sphNode;
+
+    // Child positions relative to the parent, given for scale 1:
+    // six children at the parent's height and three children above it.
+    static const SLfloat childOffsets[9][3] = {
+      {6.43951f, 0.0f, 1.72546f},
+      {1.72546f, 0.0f, 6.43951f},
+      {-4.71405f, 0.0f, 4.71405f},
+      {-6.43951f, 0.0f, -1.72546f},
+      {-1.72546f, 0.0f, -6.43951f},
+      {4.71405f, 0.0f, -4.71405f},
+      {2.72166f, 5.44331f, 2.72166f},
+      {-3.71785f, 5.44331f, 0.99619f},
+      {0.99619f, 5.44331f, -3.71785f}};
+
+    const SLint   childDepth = depth - 1;
+    const SLfloat childScale = scale / 3.0f;
+    for (const auto& offset : childOffsets)
+    {
+        sphNode->addChild(RotatingSpheres(am,
+                                          s,
+                                          childDepth,
+                                          offset[0] * scale,
+                                          offset[1] * scale,
+                                          offset[2] * scale,
+                                          childScale,
+                                          resolution,
+                                          mat));
+    }
+    return sphNode;
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass2.h b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass2.h
new file mode 100644
index 00000000..fa28090b
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimNodeMass2.h
@@ -0,0 +1,38 @@
+/**
+ * \file AppDemoSceneAnimNodeMass2.h
+ * \brief Class declaration for an 
SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEANIMMASS2_H
+#define APPDEMOSCENEANIMMASS2_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for benchmark mass animation test scene
+class AppDemoSceneAnimNodeMass2 : public SLScene
+{
+public:
+    AppDemoSceneAnimNodeMass2();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+    //! Creates a recursive rotating sphere group used for performance test
+    /*! The parameters are documented at the definition in
+    AppDemoSceneAnimNodeMass2.cpp. */
+    SLNode* RotatingSpheres(SLAssetManager* am,
+                            SLScene*        s,
+                            SLint           depth,
+                            SLfloat         x,
+                            SLfloat         y,
+                            SLfloat         z,
+                            SLfloat         scale,
+                            SLuint          resolution,
+                            SLVMaterial&    mat);
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimSkinned.cpp b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinned.cpp
new file mode 100644
index 00000000..8e81f767
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinned.cpp
@@ -0,0 +1,123 @@
+/**
+ * \file AppDemoSceneAnimSkinned.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. 
See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+//! Sets the scene name and the scene info text.
+AppDemoSceneAnimSkinned::AppDemoSceneAnimSkinned()
+  : SLScene("Skeletal Animation Test Scene")
+{
+    info("Skeletal Animation Test Scene");
+}
+//-----------------------------------------------------------------------------
+//! Registers the five skinned models of this scene for parallel loading.
+/*! Two characters and three skinned cubes are registered; each one is bound
+ * to the node member that assemble() places in the scene afterwards.
+ */
+void AppDemoSceneAnimSkinned::registerAssetsToLoad(SLAssetLoader& al)
+{
+    const auto& modelDir = AppCommon::modelPath;
+
+    // characters
+    al.addNodeToLoad(_char1, modelDir + "DAE/AstroBoy/AstroBoy.dae");
+    al.addNodeToLoad(_char2, modelDir + "GLTF/Sintel/Sintel_LowRes-Rigged.gltf");
+
+    // skinned cubes
+    al.addNodeToLoad(_cube1, modelDir + "DAE/SkinnedCube/skinnedcube2.dae");
+    al.addNodeToLoad(_cube2, modelDir + "DAE/SkinnedCube/skinnedcube4.dae");
+    al.addNodeToLoad(_cube3, modelDir + "DAE/SkinnedCube/skinnedcube5.dae");
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here. 
+/*! Places the loaded characters and skinned cubes above a floor grid, adds a
+ * camera and a spot light and starts one animation playback per model.
+ * A model or playback that is not available is skipped instead of being
+ * dereferenced.
+ * \param am Asset manager passed to the created light, material and grid
+ * \param sv Scene view that gets the created camera assigned
+ */
+void AppDemoSceneAnimSkinned::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    // Root scene node
+    SLNode* scene = new SLNode("scene group");
+    root3D(scene);
+
+    // camera
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->translation(0, 2, 10);
+    cam1->lookAt(0, 2, 0);
+    cam1->focalDist(10);
+    cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f));
+    // called once, after the camera is fully configured
+    cam1->setInitialState();
+    cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+    scene->addChild(cam1);
+
+    // light
+    SLLightSpot* light1 = new SLLightSpot(am,
+                                          this,
+                                          10,
+                                          10,
+                                          5,
+                                          0.5f);
+    light1->powers(0.2f, 1.0f, 1.0f);
+    light1->attenuation(1, 0, 0);
+    scene->addChild(light1);
+
+    // Floor grid
+    SLMaterial* m2   = new SLMaterial(am, "m2", SLCol4f::WHITE);
+    SLGrid*     grid = new SLGrid(am,
+                              SLVec3f(-5, 0, -5),
+                              SLVec3f(5, 0, 5),
+                              20,
+                              20,
+                              "Grid",
+                              m2);
+    scene->addChild(new SLNode(grid, "grid"));
+
+    // NOTE(review): the "unnamed_anim_N" names below are presumably assigned
+    // by the importer in load order - confirm that this order is stable.
+    // The playback lookups presumably return nullptr for an unknown name;
+    // every result is therefore checked before it is used.
+
+    // Astro boy character
+    if (_char1)
+    {
+        _char1->translate(-1, 0, 0);
+        scene->addChild(_char1);
+    }
+    SLAnimPlayback* char1Anim = animManager().animPlaybackByName("unnamed_anim_0");
+    if (char1Anim)
+        char1Anim->playForward();
+
+    // Sintel character
+    if (_char2)
+    {
+        _char2->translate(1, 0, 0);
+        scene->addChild(_char2);
+    }
+    SLAnimPlayback* char2Anim = animManager().animPlaybackByName("Wave");
+    if (char2Anim)
+    {
+        char2Anim->playForward();
+        char2Anim->playbackRate(30);
+    }
+
+    // Skinned cube 1
+    if (_cube1)
+    {
+        _cube1->translate(3, 0, 0);
+        scene->addChild(_cube1);
+    }
+    SLAnimPlayback* cube1Anim = animManager().animPlaybackByName("unnamed_anim_2");
+    if (cube1Anim)
+    {
+        cube1Anim->easing(EC_inOutSine);
+        cube1Anim->playForward();
+    }
+
+    // Skinned cube 2
+    if (_cube2)
+    {
+        _cube2->translate(-3, 0, 0);
+        scene->addChild(_cube2);
+    }
+    SLAnimPlayback* cube2Anim = animManager().animPlaybackByName("unnamed_anim_3");
+    if (cube2Anim)
+    {
+        cube2Anim->easing(EC_inOutSine);
+        cube2Anim->playForward();
+    }
+
+    // Skinned cube 3
+    if (_cube3)
+    {
+        _cube3->translate(0, 3, 0);
+        scene->addChild(_cube3);
+    }
+    SLAnimPlayback* cube3Anim = animManager().animPlaybackByName("unnamed_anim_4");
+    if (cube3Anim)
+    {
+        cube3Anim->loop(AL_pingPongLoop);
+        cube3Anim->easing(EC_inOutCubic);
+        cube3Anim->playForward();
+    }
+
+    sv->camera(cam1);
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimSkinned.h b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinned.h
new file mode 100644
index 00000000..3261517d
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinned.h
@@ -0,0 +1,53 @@
+/**
+ * \file AppDemoSceneAnimSkinned.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEANIMSKINNED_H
+#define APPDEMOSCENEANIMSKINNED_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for skinned animation test scene
+class AppDemoSceneAnimSkinned : public SLScene
+{
+public:
+    AppDemoSceneAnimSkinned();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+    SLNode* _char1 = nullptr; //!< Astro boy character
+    SLNode* _char2 = nullptr; //!< Sintel character
+    SLNode* _cube1 = nullptr; //!< Skinned cube 1
+    SLNode* _cube2 = nullptr; //!< Skinned cube 2
+    SLNode* _cube3 = nullptr; //!< Skinned cube 3
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass.cpp b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass.cpp
new file mode 100644
index 00000000..8f6c4183
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass.cpp
@@ -0,0 +1,106 @@
+/**
+ * \file AppDemoSceneAnimSkinnedMass.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+AppDemoSceneAnimSkinnedMass::AppDemoSceneAnimSkinnedMass()
+  : SLScene("Mass Skeletal Animation Test Scene")
+{
+    info("Mass Skeletal Animation Test Scene");
+}
+//-----------------------------------------------------------------------------
+//! All assets that should be loaded in parallel must be registered in here. 
+void AppDemoSceneAnimSkinnedMass::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Only one model is loaded; assemble() clones it for the surrounding army
+    al.addNodeToLoad(_center,
+                     AppCommon::modelPath +
+                       "DAE/AstroBoy/AstroBoy.dae");
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Places the loaded astroboy in the center of a floor rectangle and
+ * surrounds it with randomly scaled copies on a (2 * size + 1)^2 grid.
+ * \param am Asset manager passed to the created material, light and floor
+ * \param sv Scene view that gets the created camera assigned
+ */
+void AppDemoSceneAnimSkinnedMass::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    // Create materials
+    SLMaterial* m1 = new SLMaterial(am, "m1", SLCol4f::GRAY);
+    m1->specular(SLCol4f::BLACK);
+
+    // Define a light
+    SLLightSpot* light1 = new SLLightSpot(am,
+                                          this,
+                                          100,
+                                          40,
+                                          100,
+                                          1);
+    light1->powers(0.1f, 1.0f, 1.0f);
+    light1->attenuation(1, 0, 0);
+
+    // Define camera
+    SLCamera* cam1 = new SLCamera;
+    cam1->translation(0, 10, 10);
+    cam1->lookAt(0, 0, 0);
+    cam1->focalDist(cam1->translationOS().length());
+    cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f));
+    cam1->setInitialState();
+    cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+
+    // Floor rectangle
+    SLNode* rect = new SLNode(new SLRectangle(am,
+                                              SLVec2f(-20, -20),
+                                              SLVec2f(20, 20),
+                                              SLVec2f(0, 0),
+                                              SLVec2f(50, 50),
+                                              50,
+                                              50,
+                                              "Floor",
+                                              m1));
+    rect->rotate(90, -1, 0, 0);
+
+    // Start the most recently added animation playback, if there is one
+    // NOTE(review): presumably nullptr when no playback exists - confirm.
+    SLAnimPlayback* lastAnim = animManager().animPlaybacksBack();
+    if (lastAnim)
+        lastAnim->playForward();
+
+    // Assemble scene
+    SLNode* scene = new SLNode("scene group");
+    root3D(scene);
+    scene->addChild(light1);
+    scene->addChild(rect);
+    if (_center)
+        scene->addChild(_center);
+    scene->addChild(cam1);
+
+    // default-seeded engine: the random scales are the same on every run
+    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+    std::default_random_engine            randEngine;
+
+    // create astroboys around the center astroboy
+    SLint size = 4;
+    for (SLint iZ = -size; _center && iZ <= size; ++iZ)
+    {
+        for (SLint iX = -size; iX <= size; ++iX)
+        {
+            // the center cell is occupied by the original model
+            if (iX == 0 && iZ == 0)
+                continue;
+
+            // every odd column is shifted by half a cell in z
+            SLbool shift = iX % 2 != 0;
+            float  xt    = float(iX);
+            float  zt    = float(iZ) + ((shift) ? 0.5f : 0.0f);
+
+            SLNode* n = _center->copyRec();
+            n->translate(xt, 0, zt, TS_object);
+            n->scale(0.75f + 0.5f * dist(randEngine));
+            scene->addChild(n);
+        }
+    }
+
+    sv->camera(cam1);
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass.h b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass.h
new file mode 100644
index 00000000..7603a9ff
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass.h
@@ -0,0 +1,49 @@
+/**
+ * \file AppDemoSceneAnimSkinnedMass.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEANIMSKINNEDMASS_H
+#define APPDEMOSCENEANIMSKINNEDMASS_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for mass skinned animation test scene with copies of one model
+class AppDemoSceneAnimSkinnedMass : public SLScene
+{
+public:
+    AppDemoSceneAnimSkinnedMass();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+    SLNode* _center = nullptr; //!< Loaded model that gets copied in assemble
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass2.cpp b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass2.cpp
new file mode 100644
index 00000000..416c5f6c
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass2.cpp
@@ -0,0 +1,116 @@
+/**
+ * \file AppDemoSceneAnimSkinnedMass2.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+//! Sets the scene name and an info text that states the number of models.
+AppDemoSceneAnimSkinnedMass2::AppDemoSceneAnimSkinnedMass2()
+  : SLScene("Mass Skeletal Animation Test Scene")
+{
+    SLchar name[512];
+    snprintf(name,
+             sizeof(name),
+             "Massive Skinned Animation Benchmark w. %d individual Astroboys",
+             _size * _size);
+    info(name);
+}
+//-----------------------------------------------------------------------------
+//! All assets that should be loaded in parallel must be registered in here.
+/*! The same model file is registered _size * _size times, once per slot of
+ * the _astroboy array, so that every astroboy is an individually loaded node.
+ */
+void AppDemoSceneAnimSkinnedMass2::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // build the path once instead of once per registered node
+    const auto modelFile = AppCommon::modelPath + "DAE/AstroBoy/AstroBoy.dae";
+
+    for (SLint iA = 0; iA < _size * _size; ++iA)
+        al.addNodeToLoad(_astroboy[iA], modelFile);
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Arranges the loaded astroboys on a _size x _size grid centered on the
+ * origin and starts one animation playback per astroboy with a random rate.
+ * \param am Asset manager passed to the created material, light and floor
+ * \param sv Scene view that gets the created camera assigned
+ */
+void AppDemoSceneAnimSkinnedMass2::assemble(SLAssetManager* am,
+                                            SLSceneView*    sv)
+{
+    // Create materials
+    SLMaterial* m1 = new SLMaterial(am, "m1", SLCol4f::GRAY);
+    m1->specular(SLCol4f::BLACK);
+
+    // Define a light
+    SLLightSpot* light1 = new SLLightSpot(am,
+                                          this,
+                                          100,
+                                          40,
+                                          100,
+                                          1);
+    light1->powers(0.1f, 1.0f, 1.0f);
+    light1->attenuation(1, 0, 0);
+
+    // Define camera
+    SLCamera* cam1 = new SLCamera;
+    cam1->translation(0, 30, 0);
+    cam1->lookAt(0, 0, 0);
+    cam1->focalDist(cam1->translationOS().length());
+    cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f));
+    cam1->setInitialState();
+    cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+
+    // Floor rectangle
+    SLNode* rect = new SLNode(new SLRectangle(am,
+                                              SLVec2f(-20, -20),
+                                              SLVec2f(20, 20),
+                                              SLVec2f(0, 0),
+                                              SLVec2f(50, 50),
+                                              50,
+                                              50,
+                                              "Floor",
+                                              m1));
+    rect->rotate(90, -1, 0, 0);
+
+    // Assemble scene
+    SLNode* scene = new SLNode("scene group");
+    root3D(scene);
+    scene->addChild(light1);
+    scene->addChild(rect);
+    scene->addChild(cam1);
+
+    // create army with individual astroboys
+    SLfloat offset = 1.0f;
+    SLfloat z      = (float)(_size - 1) * offset * 0.5f;
+    SLuint  iA     = 0;
+
+    for (SLint iZ = 0; iZ < _size; ++iZ)
+    {
+        SLfloat x = -(float)(_size - 1) * offset * 0.5f;
+
+        for (SLint iX = 0; iX < _size; ++iX)
+        {
+            // NOTE(review): assumes playback index iA belongs to astroboy iA,
+            // i.e. one playback per loaded model in load order - confirm.
+            SLAnimPlayback* anim = animManager().animPlaybackByIndex(iA);
+            if (anim)
+            {
+                anim->playForward();
+                anim->playbackRate(Utils::random(0.5f, 1.5f));
+            }
+
+            if (_astroboy[iA])
+            {
+                _astroboy[iA]->translate(x, 0, z, TS_object);
+                scene->addChild(_astroboy[iA]);
+            }
+            x += offset;
+            iA++;
+        }
+        z -= offset;
+    }
+
+    sv->camera(cam1);
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass2.h b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass2.h
new file mode 100644
index 00000000..96728086
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneAnimSkinnedMass2.h
@@ -0,0 +1,54 @@
+/**
+ * \file AppDemoSceneAnimSkinnedMass2.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEANIMSKINNEDMASS2_H
+#define APPDEMOSCENEANIMSKINNEDMASS2_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for mass skinned animation benchmark with individually loaded models
+class AppDemoSceneAnimSkinnedMass2 : public SLScene
+{
+public:
+    AppDemoSceneAnimSkinnedMass2();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+#ifdef SL_EMSCRIPTEN
+    static const SLint _size = 10; //!< Grid edge length (smaller for the web build)
+#else
+    static const SLint _size = 20; //!< Grid edge length
+#endif
+    SLNode* _astroboy[_size * _size] = {}; //!< One node per grid cell, null until loaded
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneEmpty.cpp b/apps/app_demo/source/scenes/AppDemoSceneEmpty.cpp
new file mode 100644
index 00000000..25ae17ec
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneEmpty.cpp
@@ -0,0 +1,39 @@
+/**
+ * \file AppDemoSceneEmpty.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+AppDemoSceneEmpty::AppDemoSceneEmpty() : SLScene("Empty Scene")
+{
+    info("No Scene loaded.");
+}
+//-----------------------------------------------------------------------------
+//! 
All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneEmpty::registerAssetsToLoad(SLAssetLoader& al) +{ +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneEmpty::assemble(SLAssetManager* am, SLSceneView* sv) +{ + root3D(nullptr); + + sv->sceneViewCamera()->background().colors(SLCol4f(0.7f, 0.7f, 0.7f), + SLCol4f(0.2f, 0.2f, 0.2f)); + sv->camera(nullptr); + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneEmpty.h b/apps/app_demo/source/scenes/AppDemoSceneEmpty.h new file mode 100644 index 00000000..91526910 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneEmpty.h @@ -0,0 +1,45 @@ +/** + * \file AppDemoSceneEmpty.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEEMPTY_H +#define APPDEMOSCENEEMPTY_H + +#include + +//----------------------------------------------------------------------------- +class AppDemoSceneEmpty : public SLScene +{ +public: + AppDemoSceneEmpty(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. 
OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAugustaTmpTht.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARAugustaTmpTht.cpp new file mode 100644 index 00000000..3bd8bf7f --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAugustaTmpTht.cpp @@ -0,0 +1,263 @@ +/** + * \file AppDemoSceneErlebARAugustaTmpTht.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARAugustaTmpTht::AppDemoSceneErlebARAugustaTmpTht() + : SLScene("Augusta Raurica Temple and Theatre AR") +{ + info("Augusta Raurica Temple AR and Theatre"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneErlebARAugustaTmpTht::registerAssetsToLoad(SLAssetLoader& al) +{ + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + al.addNodeToLoad(_thtAndTmp, + AppCommon::dataPath + + "erleb-AR/models/augst/augst-thtL1L2-tmpL1L2.gltf"); + + al.addProgramToLoad(_spRefl, + AppCommon::shaderPath + "Reflect.vert", + AppCommon::shaderPath + "Reflect.frag"); + + // initialize sensor stuff before loading geotiff + AppCommon::devLoc.useOriginAltitude(false); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Center of theatre, Origin", + 47, + 31, + 59.461, + 7, + 43, + 19.446, + 282.6)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Treppe Tempel", + 47, + 31, + 58.933, + 7, + 43, + 16.799, + 290.5 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Abzweigung (Dolendeckel)", + 47, + 31, + 57.969, + 7, + 43, + 17.946, + 286.5 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Marker bei Tempel", + 47, + 31, + 59.235, + 7, + 43, + 15.161, + 293.1 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Theater 1. 
Rang Zugang Ost", + 47, + 31, + 59.698, + 7, + 43, + 20.518, + 291.0 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Theater 1. Rang Nord", + 47, + 32, + 0.216, + 7, + 43, + 19.173, + 291.0 + 1.7)); + AppCommon::devLoc.originLatLonAlt(AppCommon::devLoc.nameLocations()[0].posWGS84LatLonAlt); + AppCommon::devLoc.activeNamedLocation(1); // This sets the location 1 as defaultENU + AppCommon::devLoc.locMaxDistanceM(1000.0f); // Max. allowed distance to origin + AppCommon::devLoc.improveOrigin(false); // No autom. origin improvement + AppCommon::devLoc.hasOrigin(true); + AppCommon::devLoc.offsetMode(LOM_twoFingerY); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devRot.offsetMode(ROM_oneFingerX); + + // This loads the DEM file and overwrites the altitude of originLatLonAlt and defaultLatLonAlt + al.addGeoTiffToLoad(AppCommon::devLoc, + AppCommon::dataPath + + "erleb-AR/models/augst/DTM-Theater-Tempel-WGS84.tif"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneErlebARAugustaTmpTht::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + gVideoTexture->texType(TT_videoBkgd); + + // Create see through video background material without shadow mapping + SLMaterial* matVideoBkgd = new SLMaterial(am, + "matVideoBkgd", + gVideoTexture); + matVideoBkgd->reflectionModel(RM_Custom); + + // Create see through video background material with shadow mapping + SLMaterial* matVideoBkgdSM = new SLMaterial(am, + "matVideoBkgdSM", + gVideoTexture); + matVideoBkgdSM->reflectionModel(RM_Custom); + matVideoBkgdSM->ambient(SLCol4f(0.6f, 0.6f, 0.6f)); + matVideoBkgdSM->getsShadows(true); + + // Setup the camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 50, -150); + cam1->lookAt(0, 0, 0); + cam1->clipNear(1); + cam1->clipFar(400); + cam1->focalDist(150); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 1.0f); + sunLight->translate(-42, 10, 13); + sunLight->powers(1.0f, 1.5f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->doSunPowerAdaptation(true); + sunLight->createsShadows(true); + sunLight->createShadowMapAutoSize(cam1, SLVec2i(2048, 2048), 4); + sunLight->shadowMap()->cascadesFactor(3.0); + // Old stanard single map shadow map + // sunLight->createShadowMap(-100, 250, SLVec2f(210, 180), SLVec2i(4096, 4096)); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + sunLight->shadowMinBias(0.001f); + sunLight->shadowMaxBias(0.001f); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + std::time(nullptr)); + + // Rotate to the true geographic rotation + _thtAndTmp->rotate(16.7f, 0, 1, 0, TS_parent); + + // Let the video shine through on 
some objects without shadow mapping + SLNode* tmpUnderground = _thtAndTmp->findChild("TmpUnderground"); + if (tmpUnderground) tmpUnderground->setMeshMat(matVideoBkgd, true); + SLNode* thtUnderground = _thtAndTmp->findChild("ThtUnderground"); + if (thtUnderground) thtUnderground->setMeshMat(matVideoBkgd, true); + + // Let the video shine through on some objects with shadow mapping + SLNode* tmpFloor = _thtAndTmp->findChild("TmpFloor"); + if (tmpFloor) tmpFloor->setMeshMat(matVideoBkgdSM, true); + + SLNode* terrain = _thtAndTmp->findChild("Terrain"); + if (terrain) + { + terrain->setMeshMat(matVideoBkgdSM, true); + terrain->castsShadows(false); + } + SLNode* thtFrontTerrain = _thtAndTmp->findChild("ThtFrontTerrain"); + if (thtFrontTerrain) + { + thtFrontTerrain->setMeshMat(matVideoBkgdSM, true); + thtFrontTerrain->castsShadows(false); + } + + // Add axis object a world origin + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->rotate(-90, 1, 0, 0); + axis->castsShadows(false); + + // Set some ambient light + _thtAndTmp->updateMeshMat([](SLMaterial* m) + { m->ambient(SLCol4f(.25f, .25f, .25f)); }, + true); + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(sunLight); + scene->addChild(axis); + scene->addChild(_thtAndTmp); + scene->addChild(cam1); + + // Level of Detail switch for Temple and Theater + SLNode* tmpAltar = _thtAndTmp->findChild("TmpAltar"); + SLNode* tmpL1 = _thtAndTmp->findChild("Tmp-L1"); + SLNode* tmpL2 = _thtAndTmp->findChild("Tmp-L2"); + SLNode* thtL1 = _thtAndTmp->findChild("Tht-L1"); + SLNode* thtL2 = _thtAndTmp->findChild("Tht-L2"); + thtL1->drawBits()->set(SL_DB_HIDDEN, false); + thtL2->drawBits()->set(SL_DB_HIDDEN, true); + tmpL1->drawBits()->set(SL_DB_HIDDEN, false); + tmpL2->drawBits()->set(SL_DB_HIDDEN, true); + + // Add level of detail switch callback lambda + cam1->onCamUpdateCB([=](SLSceneView* sv) + { + SLVec3f posCam = 
sv->camera()->updateAndGetWM().translation(); + SLVec3f posAlt = tmpAltar->updateAndGetWM().translation(); + SLVec3f distCamAlt = posCam - posAlt; + float tmpDist = distCamAlt.length(); + float thtDist = posCam.length(); + + // If the temple is closer than the theater activate level 1 and deactivate level 2 + if (tmpDist < thtDist) + { + thtL1->drawBits()->set(SL_DB_HIDDEN, true); + thtL2->drawBits()->set(SL_DB_HIDDEN, false); + tmpL1->drawBits()->set(SL_DB_HIDDEN, false); + tmpL2->drawBits()->set(SL_DB_HIDDEN, true); + } + else + { + thtL1->drawBits()->set(SL_DB_HIDDEN, false); + thtL2->drawBits()->set(SL_DB_HIDDEN, true); + tmpL1->drawBits()->set(SL_DB_HIDDEN, true); + tmpL2->drawBits()->set(SL_DB_HIDDEN, false); + } }); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + AppCommon::devLoc.isUsed(true); + AppCommon::devRot.isUsed(true); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + AppCommon::devLoc.isUsed(false); + AppCommon::devRot.isUsed(false); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); + SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); + cam1->translation(pos_f); + cam1->focalDist(pos_f.length()); + cam1->lookAt(SLVec3f::ZERO); + cam1->camAnim(SLCamAnim::CA_turntableYUp); +#endif + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAugustaTmpTht.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARAugustaTmpTht.h new file mode 100644 index 00000000..3d3c3fe2 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAugustaTmpTht.h @@ -0,0 +1,51 @@ +/** + * \file AppDemoSceneErlebARAugustaTmp.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * 
\authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEERLEBAR_AUGUSTATMPTHT_H +#define APPDEMOSCENEERLEBAR_AUGUSTATMPTHT_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ErlebAR model for Augusta Raurica Tempel & Theater. +class AppDemoSceneErlebARAugustaTmpTht : public SLScene +{ +public: + AppDemoSceneErlebARAugustaTmpTht(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _thtAndTmp; + SLGLTexture* _cubemap; + SLGLProgram* _spRefl; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumAmphitheater.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumAmphitheater.cpp new file mode 100644 index 00000000..84b089a6 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumAmphitheater.cpp @@ -0,0 +1,173 @@ +/** + * \file AppDemoSceneErlebARAventicumAmphitheater.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARAventicumAmphitheater::AppDemoSceneErlebARAventicumAmphitheater() + : SLScene("Aventicum Theatre AR") +{ + info("Aventicum Theatre AR"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneErlebARAventicumAmphitheater::registerAssetsToLoad(SLAssetLoader& al) +{ + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + al.addNodeToLoad(_theater, + AppCommon::dataPath + + "erleb-AR/models/avenches/avenches-amphitheater.gltf"); + + // initialize sensor stuff before loading the geotiff + AppCommon::devLoc.useOriginAltitude(false); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Arena Centre, Origin", 46, 52, 51.685, 7, 2, 33.458, 461.4)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Entrance East, Manhole Cover", 46, 52, 52.344, 7, 2, 37.600, 461.4 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Arena, Sewer Cover West", 46, 52, 51.484, 7, 2, 32.307, 461.3 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Arena, Sewer Cover East", 46, 52, 51.870, 7, 2, 34.595, 461.1 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Stand South, Sewer Cover", 46, 52, 50.635, 7, 2, 34.099, 471.7 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Stand West, Sewer Cover", 46, 52, 51.889, 7, 2, 31.567, 471.7 + 1.7)); + AppCommon::devLoc.originLatLonAlt(AppCommon::devLoc.nameLocations()[0].posWGS84LatLonAlt); + AppCommon::devLoc.activeNamedLocation(1); // This sets the location 1 as defaultENU + AppCommon::devLoc.locMaxDistanceM(1000.0f); // Max. Distanz. zum Nullpunkt + AppCommon::devLoc.improveOrigin(false); // Keine autom. 
Verbesserung vom Origin + AppCommon::devLoc.hasOrigin(true); + AppCommon::devLoc.offsetMode(LOM_twoFingerY); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devRot.offsetMode(ROM_oneFingerX); + + // This loads the DEM file and overwrites the altitude of originLatLonAlt and defaultLatLonAlt + al.addGeoTiffToLoad(AppCommon::devLoc, + AppCommon::dataPath + + "erleb-AR/models/avenches/DTM-Aventicum-WGS84.tif"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneErlebARAventicumAmphitheater::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + gVideoTexture->texType(TT_videoBkgd); + + // Create see through video background material without shadow mapping + SLMaterial* matVideoBkgd = new SLMaterial(am, "matVideoBkgd", gVideoTexture); + matVideoBkgd->reflectionModel(RM_Custom); + + // Create see through video background material with shadow mapping + SLMaterial* matVideoBkgdSM = new SLMaterial(am, "matVideoBkgdSM", gVideoTexture); + matVideoBkgdSM->reflectionModel(RM_Custom); + matVideoBkgdSM->ambient(SLCol4f(0.6f, 0.6f, 0.6f)); + matVideoBkgdSM->getsShadows(true); + + // Setup the camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 50, -150); + cam1->lookAt(0, 0, 0); + cam1->clipNear(1); + cam1->clipFar(400); + cam1->focalDist(150); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 1.0f); + sunLight->powers(1.0f, 1.5f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->translation(0, 1, 0); + sunLight->doSunPowerAdaptation(true); + sunLight->createsShadows(true); + sunLight->createShadowMapAutoSize(cam1, SLVec2i(2048, 2048), 4); + 
sunLight->shadowMap()->cascadesFactor(3.0); + // sunLight->createShadowMap(-70, 70, SLVec2f(140, 100), SLVec2i(4096, 4096)); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + sunLight->shadowMinBias(0.001f); + sunLight->shadowMaxBias(0.003f); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + std::time(nullptr)); + + // Rotate to the true geographic rotation + _theater->rotate(13.25f, 0, 1, 0, TS_parent); + + // Let the video shine through some objects + _theater->findChild("Tht-Aussen-Untergrund")->setMeshMat(matVideoBkgd, true); + _theater->findChild("Tht-Eingang-Ost-Boden")->setMeshMat(matVideoBkgdSM, true); + _theater->findChild("Tht-Arenaboden")->setMeshMat(matVideoBkgdSM, true); + // amphiTheatre->findChild("Tht-Aussen-Terrain")->setMeshMat(matVideoBkgdSM, true); + + // Add axis object a world origin + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->rotate(-90, 1, 0, 0); + axis->castsShadows(false); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(sunLight); + scene->addChild(axis); + scene->addChild(_theater); + scene->addChild(cam1); + + // initialize sensor stuff + AppCommon::devLoc.useOriginAltitude(false); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Arena Centre, Origin", 46, 52, 51.685, 7, 2, 33.458, 461.4)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Entrance East, Manhole Cover", 46, 52, 52.344, 7, 2, 37.600, 461.4 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Arena, Sewer Cover West", 46, 52, 51.484, 7, 2, 32.307, 461.3 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Arena, Sewer Cover East", 46, 52, 51.870, 7, 2, 34.595, 461.1 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Stand South, Sewer Cover", 46, 52, 50.635, 7, 2, 
34.099, 471.7 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Stand West, Sewer Cover", 46, 52, 51.889, 7, 2, 31.567, 471.7 + 1.7)); + AppCommon::devLoc.originLatLonAlt(AppCommon::devLoc.nameLocations()[0].posWGS84LatLonAlt); + AppCommon::devLoc.activeNamedLocation(1); // This sets the location 1 as defaultENU + AppCommon::devLoc.locMaxDistanceM(1000.0f); // Max. Distanz. zum Nullpunkt + AppCommon::devLoc.improveOrigin(false); // Keine autom. Verbesserung vom Origin + AppCommon::devLoc.hasOrigin(true); + AppCommon::devLoc.offsetMode(LOM_twoFingerY); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devRot.offsetMode(ROM_oneFingerX); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + AppCommon::devLoc.isUsed(true); + AppCommon::devRot.isUsed(true); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + AppCommon::devLoc.isUsed(false); + AppCommon::devRot.isUsed(false); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); + SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); + cam1->translation(pos_f); + cam1->focalDist(pos_f.length()); + cam1->lookAt(SLVec3f::ZERO); + cam1->camAnim(SLCamAnim::CA_turntableYUp); +#endif + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumAmphitheater.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumAmphitheater.h new file mode 100644 index 00000000..7d2adbe5 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumAmphitheater.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneErlebARAventicumAmphitheater.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von 
Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEERLEBAR_AVENTICUMAMPHI_H +#define APPDEMOSCENEERLEBAR_AVENTICUMAMPHI_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ErlebAR model for Aventicum Amphitheater. +class AppDemoSceneErlebARAventicumAmphitheater : public SLScene +{ +public: + AppDemoSceneErlebARAventicumAmphitheater(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _theater; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumCigognier.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumCigognier.cpp new file mode 100644 index 00000000..5e329993 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumCigognier.cpp @@ -0,0 +1,153 @@ +/** + * \file AppDemoSceneErlebARAventicumCigognier.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARAventicumCigognier::AppDemoSceneErlebARAventicumCigognier() + : SLScene("Aventicum Cigognier AR") +{ + info("Aventicum Cigognier AR"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+/*!
+ * Registers the video texture, the Cigognier model and the GeoTiff for
+ * loading and configures the device location and rotation sensors.
+ * The sensors are configured before the GeoTiff is registered.
+ * @param al Asset loader that receives the texture, node and GeoTiff tasks.
+ */
+void AppDemoSceneErlebARAventicumCigognier::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Short aliases for the application-wide sensor objects configured below
+    auto& devLoc = AppCommon::devLoc;
+    auto& devRot = AppCommon::devRot;
+
+    // Video texture on the global pointer that gets updated in AppDemoVideo
+    al.addTextureToLoad(gVideoTexture,
+                        AppCommon::texturePath + "LiveVideoError.png",
+                        GL_LINEAR,
+                        GL_LINEAR);
+
+    // The Cigognier model
+    al.addNodeToLoad(_cigognier,
+                     AppCommon::dataPath +
+                       "erleb-AR/models/avenches/avenches-cigognier.gltf");
+
+    // The sensors have to be set up before the GeoTiff gets registered
+    devLoc.useOriginAltitude(false);
+
+    // Named locations; the entry at index 0 serves as origin below
+    devLoc.nameLocations().push_back(SLLocation("Center of place, Origin",
+                                                46,
+                                                52,
+                                                53.245,
+                                                7,
+                                                2,
+                                                47.198,
+                                                450.9));
+    devLoc.nameLocations().push_back(SLLocation("At the altar",
+                                                46,
+                                                52,
+                                                53.107,
+                                                7,
+                                                2,
+                                                47.498,
+                                                450.9 + 1.7));
+    devLoc.nameLocations().push_back(SLLocation("Old AR viewer",
+                                                46,
+                                                52,
+                                                53.666,
+                                                7,
+                                                2,
+                                                48.316,
+                                                451.0 + 1.7));
+    devLoc.nameLocations().push_back(SLLocation("Temple Entrance in hall",
+                                                46,
+                                                52,
+                                                54.007,
+                                                7,
+                                                2,
+                                                45.702,
+                                                453.0 + 1.7));
+
+    devLoc.originLatLonAlt(devLoc.nameLocations()[0].posWGS84LatLonAlt);
+    devLoc.activeNamedLocation(1);   // Location 1 becomes the defaultENU
+    devLoc.locMaxDistanceM(1000.0f); // Max. allowed distance from origin
+    devLoc.improveOrigin(false);     // No automatic origin improvement
+    devLoc.hasOrigin(true);
+    devLoc.offsetMode(LOM_twoFingerY);
+
+    devRot.zeroYawAtStart(false);
+    devRot.offsetMode(ROM_oneFingerX);
+
+    // Loading the DEM file overwrites the altitude of originLatLonAlt and
+    // defaultLatLonAlt
+    al.addGeoTiffToLoad(devLoc,
+                        AppCommon::dataPath +
+                          "erleb-AR/models/avenches/DTM-Aventicum-WGS84.tif");
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneErlebARAventicumCigognier::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Builds the scene graph from the loaded assets: video background materials, camera, sun light, axis and the model are added under a new root node. + gVideoTexture->texType(TT_videoBkgd); + + // Create see through video background material without shadow mapping + SLMaterial* matVideoBkgd = new SLMaterial(am, "matVideoBkgd", gVideoTexture); + matVideoBkgd->reflectionModel(RM_Custom); + + // Create see through video background material with shadow mapping + SLMaterial* matVideoBkgdSM = new SLMaterial(am, "matVideoBkgdSM", gVideoTexture); + matVideoBkgdSM->reflectionModel(RM_Custom); + matVideoBkgdSM->ambient(SLCol4f(0.6f, 0.6f, 0.6f)); + matVideoBkgdSM->getsShadows(true); + + // Setup the camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 50, -150); + cam1->lookAt(0, 0, 0); + cam1->clipNear(1); + cam1->clipFar(400); + cam1->focalDist(150); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 1.0f); + sunLight->powers(1.0f, 1.0f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->translation(0, 10, 0); + sunLight->lookAt(10, 0, 10); + sunLight->doSunPowerAdaptation(true); + sunLight->createsShadows(true); + sunLight->createShadowMapAutoSize(cam1, SLVec2i(2048, 2048), 4); + sunLight->shadowMap()->cascadesFactor(3.0); + // sunLight->createShadowMap(-70, 120, SLVec2f(150, 150), SLVec2i(2048, 2048)); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + sunLight->shadowMinBias(0.001f); + sunLight->shadowMaxBias(0.003f); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + std::time(nullptr)); + + // Rotate to the true geographic rotation + _cigognier->rotate(-36.52f, 0, 1, 0, TS_parent); + + // Let the video
shine through some objects + _cigognier->findChild("Tmp-Sol-Pelouse")->setMeshMat(matVideoBkgdSM, true); + _cigognier->findChild("Tmp-Souterrain")->setMeshMat(matVideoBkgd, true); + + // Add axis object at the world origin + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->rotate(-90, 1, 0, 0); + axis->castsShadows(false); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(sunLight); + scene->addChild(axis); + scene->addChild(_cigognier); + scene->addChild(cam1); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + AppCommon::devLoc.isUsed(true); + AppCommon::devRot.isUsed(true); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + AppCommon::devLoc.isUsed(false); + AppCommon::devRot.isUsed(false); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); + SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); + cam1->translation(pos_f); + cam1->focalDist(pos_f.length()); + cam1->lookAt(0, cam1->translationWS().y, 0); + cam1->camAnim(SLCamAnim::CA_turntableYUp); +#endif + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumCigognier.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumCigognier.h new file mode 100644 index 00000000..d5cc15bf --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumCigognier.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneErlebARAventicumCigognier.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to
format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEERLEBAR_AVENTICUMCIG_H +#define APPDEMOSCENEERLEBAR_AVENTICUMCIG_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ErlebAR model for Aventicum Cigognier. +class AppDemoSceneErlebARAventicumCigognier : public SLScene +{ +public: + AppDemoSceneErlebARAventicumCigognier(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _cigognier; //!< Model node handed to SLAssetLoader::addNodeToLoad +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumTheater.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumTheater.cpp new file mode 100644 index 00000000..aa949f58 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumTheater.cpp @@ -0,0 +1,157 @@ +/** + * \file AppDemoSceneErlebARAventicumTheater.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARAventicumTheater::AppDemoSceneErlebARAventicumTheater() + : SLScene("Aventicum Theatre AR") +{ + info("Aventicum Theatre AR"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneErlebARAventicumTheater::registerAssetsToLoad(SLAssetLoader& al) +{ + // Registers the video texture, the theater glTF model and the DTM GeoTIFF with the asset loader and configures the global device location/rotation objects. + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + al.addNodeToLoad(_theater, + AppCommon::dataPath + + "erleb-AR/models/avenches/avenches-theater.gltf"); + + // initialize sensor stuff before loading the geotiff + // https://map.geo.admin.ch/?lang=de&topic=ech&bgLayer=ch.swisstopo.swissimage&layers=ch.swisstopo.zeitreihen,ch.bfs.gebaeude_wohnungs_register,ch.bav.haltestellen-oev,ch.swisstopo.swisstlm3d-wanderwege&layers_opacity=1,1,1,0.8&layers_visibility=false,false,false,false&layers_timestamp=18641231,,,&E=2570281&N=1192204&zoom=13&crosshair=marker + AppCommon::devLoc.useOriginAltitude(false); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Center of theatre, Origin", 46, 52, 49.041, 7, 2, 55.543, 454.9)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("On the stage", 46, 52, 49.221, 7, 2, 55.206, 455.5 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("At the tree (N-E)", 46, 52, 50.791, 7, 2, 55.960, 455.5 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Over the entrance (S)", 46, 52, 48.162, 7, 2, 56.097, 464.0 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("At the 3rd tree (S-W)", 46, 52, 48.140, 7, 2, 51.506, 455.0 + 1.7)); + AppCommon::devLoc.originLatLonAlt(AppCommon::devLoc.nameLocations()[0].posWGS84LatLonAlt); + AppCommon::devLoc.activeNamedLocation(1); // This sets the location 1 as defaultENU + AppCommon::devLoc.locMaxDistanceM(1000.0f); // Max. allowed distance from origin + AppCommon::devLoc.improveOrigin(false); // No automatic
improvement of the origin + AppCommon::devLoc.hasOrigin(true); + AppCommon::devLoc.offsetMode(LOM_twoFingerY); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devRot.offsetMode(ROM_oneFingerX); + + // This loads the DEM file and overwrites the altitude of originLatLonAlt and defaultLatLonAlt + al.addGeoTiffToLoad(AppCommon::devLoc, + AppCommon::dataPath + + "erleb-AR/models/avenches/DTM-Aventicum-WGS84.tif"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneErlebARAventicumTheater::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Builds the scene graph from the loaded assets: video background materials, camera, sun light, axis and the model are added under a new root node. + gVideoTexture->texType(TT_videoBkgd); + + // Create see through video background material without shadow mapping + SLMaterial* matVideoBkgd = new SLMaterial(am, "matVideoBkgd", gVideoTexture); + matVideoBkgd->reflectionModel(RM_Custom); + + // Create see through video background material with shadow mapping + SLMaterial* matVideoBkgdSM = new SLMaterial(am, "matVideoBkgdSM", gVideoTexture); + matVideoBkgdSM->reflectionModel(RM_Custom); + matVideoBkgdSM->ambient(SLCol4f(0.6f, 0.6f, 0.6f)); + matVideoBkgdSM->getsShadows(true); + + // Setup the camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 50, -150); + cam1->lookAt(0, 0, 0); + cam1->clipNear(1); + cam1->clipFar(300); + cam1->focalDist(150); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 1.0f); + sunLight->powers(1.0f, 1.0f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->translation(0, 1, 0); + + sunLight->doSunPowerAdaptation(true); + sunLight->createsShadows(true); + sunLight->createShadowMapAutoSize(cam1, SLVec2i(2048, 2048), 4); +
sunLight->shadowMap()->cascadesFactor(3.0); + // sunLight->createShadowMap(-80, 100, SLVec2f(130, 130), SLVec2i(4096, 4096)); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + sunLight->shadowMinBias(0.001f); + sunLight->shadowMaxBias(0.001f); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + std::time(nullptr)); + + // Rotate to the true geographic rotation + _theater->rotate(-36.7f, 0, 1, 0, TS_parent); + + // Let the video shine through some objects + _theater->findChild("Tht-Rasen")->setMeshMat(matVideoBkgdSM, true); + _theater->findChild("Tht-Untergrund")->setMeshMat(matVideoBkgd, true); + _theater->findChild("Tht-Boden")->setMeshMat(matVideoBkgdSM, true); + _theater->findChild("Tht-Boden")->setDrawBitsRec(SL_DB_WITHEDGES, true); + + // Add axis object at the world origin + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->rotate(-90, 1, 0, 0); + axis->castsShadows(false); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(sunLight); + scene->addChild(axis); + scene->addChild(_theater); + scene->addChild(cam1); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + AppCommon::devLoc.isUsed(true); + AppCommon::devRot.isUsed(true); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + AppCommon::devLoc.isUsed(false); + AppCommon::devRot.isUsed(false); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); + SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); + cam1->translation(pos_f); + cam1->focalDist(pos_f.length()); + cam1->lookAt(SLVec3f::ZERO); + cam1->camAnim(SLCamAnim::CA_turntableYUp); +#endif + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git
a/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumTheater.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumTheater.h new file mode 100644 index 00000000..44408e82 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARAventicumTheater.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneErlebARAventicumTheater.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEERLEBAR_AVENTICUMTHT_H +#define APPDEMOSCENEERLEBAR_AVENTICUMTHT_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ErlebAR model for Aventicum Theater. +class AppDemoSceneErlebARAventicumTheater : public SLScene +{ +public: + AppDemoSceneErlebARAventicumTheater(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread.
It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _theater; //!< Model node handed to SLAssetLoader::addNodeToLoad +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARBernChristoffel.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARBernChristoffel.cpp new file mode 100644 index 00000000..29e60828 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARBernChristoffel.cpp @@ -0,0 +1,263 @@ +/** + * \file AppDemoSceneErlebARBernChristoffel.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARBernChristoffel::AppDemoSceneErlebARBernChristoffel() + : SLScene("Christoffel Tower AR") +{ + info("Augmented Reality Christoffel Tower"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneErlebARBernChristoffel::registerAssetsToLoad(SLAssetLoader& al) +{ + // Registers the video texture, the Bern glTF model, the cube map, the reflection shader program and the DEM GeoTIFF with the asset loader and configures the global device location/rotation objects. + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + al.addNodeToLoad(_bern, + AppCommon::dataPath + + "erleb-AR/models/bern/bern-christoffel.gltf"); + al.addTextureToLoad(_cubemap, + AppCommon::dataPath + "erleb-AR/models/bern/Sea1+X1024.jpg", + AppCommon::dataPath + "erleb-AR/models/bern/Sea1-X1024.jpg", + AppCommon::dataPath + "erleb-AR/models/bern/Sea1+Y1024.jpg", + AppCommon::dataPath + "erleb-AR/models/bern/Sea1-Y1024.jpg", + AppCommon::dataPath + "erleb-AR/models/bern/Sea1+Z1024.jpg", + AppCommon::dataPath + "erleb-AR/models/bern/Sea1-Z1024.jpg"); + al.addProgramToLoad(_spRefl, + AppCommon::shaderPath + "Reflect.vert", + AppCommon::shaderPath + "Reflect.frag"); + + // initialize sensor stuff before loading the geotiff + AppCommon::devLoc.originLatLonAlt(46.94763, 7.44074, 542.2); // Loeb corner (Loeb Ecken) + AppCommon::devLoc.defaultLatLonAlt(46.94841, 7.43970, 542.2 + 1.7); // Railway station exit at eye level + AppCommon::devLoc.nameLocations().push_back(SLLocation("Loeb Ecken, Origin", + 46, + 56, + 51.451, + 7, + 26, + 26.676, + 542.2)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Milchgässli, Velomarkierung, (N)", + 46, + 56, + 54.197, + 7, + 26, + 23.366, + 541.2 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Spitalgasse (E)", + 46, + 56, + 51.703, + 7, + 26, + 27.565, + 542.1 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Tramhaltestelle UBS, eckiger Schachtd.
(S)", + 46, + 56, + 50.366, + 7, + 26, + 24.544, + 542.3 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Ecke Schauplatz-Christoffelgasse (S)", + 46, + 56, + 50.139, + 7, + 26, + 27.225, + 542.1 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Bubenbergplatz (S)", + 46, + 56, + 50.304, + 7, + 26, + 22.113, + 542.4 + 1.7)); + AppCommon::devLoc.nameLocations().push_back(SLLocation("Heiliggeistkirche (Dole, N-W)", + 46, + 56, + 53.500, + 7, + 26, + 25.499, + 541.6 + 1.7)); + AppCommon::devLoc.originLatLonAlt(AppCommon::devLoc.nameLocations()[0].posWGS84LatLonAlt); + AppCommon::devLoc.activeNamedLocation(1); // This sets the location 1 as defaultENU + AppCommon::devLoc.locMaxDistanceM(1000.0f); // Max. allowed distance from the Loeb corner + AppCommon::devLoc.improveOrigin(false); // No automatic improvement of the origin + AppCommon::devLoc.useOriginAltitude(true); + AppCommon::devLoc.hasOrigin(true); + AppCommon::devLoc.offsetMode(LOM_twoFingerY); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devRot.offsetMode(ROM_oneFingerX); + + // This loads the DEM file and overwrites the altitude of originLatLonAlt and defaultLatLonAlt + al.addGeoTiffToLoad(AppCommon::devLoc, + AppCommon::dataPath + + "erleb-AR/models/bern/DEM-Bern-2600_1199-WGS84.tif"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneErlebARBernChristoffel::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Builds the scene graph from the loaded assets: video background and water materials, camera, sun light, axis, two node animations and the model are set up under a new root node. + gVideoTexture->texType(TT_videoBkgd); + + // Create see through video background material without shadow mapping + SLMaterial* matVideoBkgd = new SLMaterial(am, + "matVideoBkgd", + gVideoTexture); + matVideoBkgd->reflectionModel(RM_Custom); + + // Create see through video background material with shadow mapping + SLMaterial* matVideoBkgdSM = new SLMaterial(am, + "matVideoBkgdSM", + gVideoTexture); + matVideoBkgdSM->reflectionModel(RM_Custom); + matVideoBkgdSM->ambient(SLCol4f(0.6f, 0.6f, 0.6f)); + matVideoBkgdSM->getsShadows(true); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 2, 0); + cam1->lookAt(-10, 2, 0); + cam1->clipNear(1); + cam1->clipFar(700); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 2.0f); + sunLight->translate(-44.89f, 18.05f, -26.07f); + sunLight->powers(1.0f, 1.5f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->doSunPowerAdaptation(true); + sunLight->createsShadows(true); + sunLight->createShadowMapAutoSize(cam1, + SLVec2i(2048, 2048), + 4); + sunLight->shadowMap()->cascadesFactor(3.0); + // sunLight->createShadowMap(-100, 150, SLVec2f(200, 150), SLVec2i(4096, 4096)); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + sunLight->shadowMinBias(0.001f); + sunLight->shadowMaxBias(0.003f); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + std::time(nullptr)); + + // Make city with hard edges and without shadow mapping + SLNode* Umg = _bern->findChild("Umgebung-Swisstopo"); + Umg->setMeshMat(matVideoBkgd, true); +
Umg->setDrawBitsRec(SL_DB_WITHEDGES, true); + Umg->castsShadows(false); + + // Hide some objects + _bern->findChild("Baldachin-Glas")->drawBits()->set(SL_DB_HIDDEN, true); + _bern->findChild("Baldachin-Stahl")->drawBits()->set(SL_DB_HIDDEN, true); + + // Set the video background shader on the baldachin and the ground with shadow mapping + _bern->findChild("Baldachin-Stahl")->setMeshMat(matVideoBkgdSM, true); + _bern->findChild("Baldachin-Glas")->setMeshMat(matVideoBkgdSM, true); + _bern->findChild("Chr-Alt-Stadtboden")->setMeshMat(matVideoBkgdSM, true); + _bern->findChild("Chr-Neu-Stadtboden")->setMeshMat(matVideoBkgdSM, true); + + // Hide the new (last) version of the Christoffel tower + _bern->findChild("Chr-Neu")->drawBits()->set(SL_DB_HIDDEN, true); + + // Material for water + SLMaterial* matWater = new SLMaterial(am, + "water", + SLCol4f::BLACK, + SLCol4f::BLACK, + 100, + 0.1f, + 0.9f, + 1.5f); + matWater->translucency(1000); + matWater->transmissive(SLCol4f::WHITE); + matWater->addTexture(_cubemap); + matWater->program(_spRefl); + _bern->findChild("Chr-Wasser")->setMeshMat(matWater, true); + + // Add axis object at the world origin (Loeb Ecke) + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->rotate(-90, 1, 0, 0); + axis->castsShadows(false); + + // Bridge rotation animation + SLNode* bridge = _bern->findChild("Chr-Alt-Tor"); + SLAnimation* bridgeAnim = animManager().createNodeAnimation("Gate animation", + 8.0f, + true, + EC_inOutQuint, + AL_pingPongLoop); + bridgeAnim->createNodeAnimTrackForRotation(bridge, 90, bridge->forwardOS()); + + // Gate translation animation + SLNode* gate = _bern->findChild("Chr-Alt-Gatter"); + SLAnimation* gateAnim = animManager().createNodeAnimation("Gatter Animation", + 8.0f, + true, + EC_inOutQuint, + AL_pingPongLoop); + gateAnim->createNodeAnimTrackForTranslation(gate, SLVec3f(0.0f, -3.6f, 0.0f)); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); +
scene->addChild(sunLight); + scene->addChild(axis); + scene->addChild(_bern); + scene->addChild(cam1); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + AppCommon::devLoc.isUsed(true); + AppCommon::devRot.isUsed(true); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + AppCommon::devLoc.isUsed(false); + AppCommon::devRot.isUsed(false); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); + SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); + cam1->translation(pos_f); + cam1->focalDist(pos_f.length()); + cam1->lookAt(SLVec3f::ZERO); + cam1->camAnim(SLCamAnim::CA_turntableYUp); +#endif + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARBernChristoffel.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARBernChristoffel.h new file mode 100644 index 00000000..f86b2b82 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARBernChristoffel.h @@ -0,0 +1,51 @@ +/** + * \file AppDemoSceneErlebARBernChristoffel.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEERLEBAR_CHRISTOFFEL_H +#define APPDEMOSCENEERLEBAR_CHRISTOFFEL_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ErlebAR model Christoffel Tower in Bern. +class AppDemoSceneErlebARBernChristoffel : public SLScene +{ +public: + AppDemoSceneErlebARBernChristoffel(); + + //!
All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _bern; //!< Model node handed to SLAssetLoader::addNodeToLoad + SLGLTexture* _cubemap; //!< Cube map texture added to the water material + SLGLProgram* _spRefl; //!< Reflection shader program set on the water material +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARBielBFH.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARBielBFH.cpp new file mode 100644 index 00000000..0f1cf954 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARBielBFH.cpp @@ -0,0 +1,152 @@ +/** + * \file AppDemoSceneErlebARBielBFH.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARBielBFH::AppDemoSceneErlebARBielBFH() + : SLScene("Biel-BFH AR") +{ + info("Augmented Reality at Biel-BFH"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneErlebARBielBFH::registerAssetsToLoad(SLAssetLoader& al) +{ + // Registers the video texture, the Biel-BFH glTF model, the video background shader program and the DEM GeoTIFF with the asset loader and configures the global device location/rotation objects. + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + al.addNodeToLoad(_bfh, + AppCommon::dataPath + + "erleb-AR/models/biel/Biel-BFH-Rolex.gltf"); + + al.addProgramToLoad(_spVideoBackground, + AppCommon::shaderPath + "PerPixTmBackground.vert", + AppCommon::shaderPath + "PerPixTmBackground.frag"); + + // initialize sensor stuff before loading the geotiff + AppCommon::devLoc.originLatLonAlt(47.14271, 7.24337, 488.2); // Giosa corner (Ecke Giosa) + AppCommon::devLoc.defaultLatLonAlt(47.14260, 7.24310, 488.7 + 1.7); // On the parking lot + AppCommon::devLoc.locMaxDistanceM(1000.0f); + AppCommon::devLoc.improveOrigin(false); + AppCommon::devLoc.useOriginAltitude(true); + AppCommon::devLoc.hasOrigin(true); + AppCommon::devLoc.offsetMode(LOM_twoFingerY); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devRot.offsetMode(ROM_oneFingerX); + + // This loads the DEM file and overwrites the altitude of originLatLonAlt and defaultLatLonAlt + al.addGeoTiffToLoad(AppCommon::devLoc, + AppCommon::dataPath + + "erleb-AR/models/biel/DEM_Biel-BFH_WGS84.tif"); +}
+//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneErlebARBielBFH::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Builds the scene graph from the loaded assets: video background material, camera, sun light, axis and the model are added under a new root node. + SLMaterial* matVideoBkgd = new SLMaterial(am, + "matVideoBkgd", + gVideoTexture, + nullptr, + nullptr, + nullptr, + _spVideoBackground); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 2, 0); + cam1->lookAt(-10, 2, 0); + cam1->clipNear(1); + cam1->clipFar(1000); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 5.0f); + sunLight->powers(1.0f, 1.0f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->doSunPowerAdaptation(true); + sunLight->createsShadows(true); + sunLight->createShadowMap(-100, + 150, + SLVec2f(150, 150), + SLVec2i(4096, 4096)); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(), + std::time(nullptr)); + + _bfh->setMeshMat(matVideoBkgd, true); + + // Let the video shine through the terrain + // _bfh->findChild("Terrain")->setMeshMat(matVideoBkgd, true); + + /* Make buildings transparent + SLNode* buildings = _bfh->findChild("Buildings"); + SLNode* roofs = _bfh->findChild("Roofs"); + auto updateTranspFnc = [](SLMaterial* m) {m->kt(0.5f);}; + buildings->updateMeshMat(updateTranspFnc, true); + roofs->updateMeshMat(updateTranspFnc, true); + + // Set ambient on all child nodes + _bfh->updateMeshMat([](SLMaterial* m) { m->ambient(SLCol4f(.2f, .2f, .2f)); }, true); + */ + + // Add axis object at the world origin + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis
Node"); + axis->scale(2); + axis->rotate(-90, 1, 0, 0); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(sunLight); + scene->addChild(axis); + scene->addChild(_bfh); + scene->addChild(cam1); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + AppCommon::devLoc.isUsed(true); + AppCommon::devRot.isUsed(true); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + AppCommon::devLoc.isUsed(false); + AppCommon::devRot.isUsed(false); + SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU(); + SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z); + cam1->translation(pos_f); + cam1->focalDist(pos_f.length()); + cam1->lookAt(SLVec3f::ZERO); + cam1->camAnim(SLCamAnim::CA_turntableYUp); +#endif + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); + sv->drawBits()->on(SL_DB_ONLYEDGES); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARBielBFH.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARBielBFH.h new file mode 100644 index 00000000..0cd58591 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARBielBFH.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneErlebARBielBFH.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEERLEBAR_BIELBFH_H +#define APPDEMOSCENEERLEBAR_BIELBFH_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ErlebAR model in Biel at BFH.
+class AppDemoSceneErlebARBielBFH : public SLScene +{ +public: + AppDemoSceneErlebARBielBFH(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that no OpenGL + calls are allowed in there. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _bfh; //!< Model node handed to SLAssetLoader::addNodeToLoad + SLGLProgram* _spVideoBackground; //!< Shader program passed to the video background material +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARSutz.cpp b/apps/app_demo/source/scenes/AppDemoSceneErlebARSutz.cpp new file mode 100644 index 00000000..de85a4f5 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARSutz.cpp @@ -0,0 +1,159 @@ +/** + * \file AppDemoSceneErlebARSutz.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneErlebARSutz::AppDemoSceneErlebARSutz() + : SLScene("Sutz AR") +{ + info("Augmented Reality at Sutz"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneErlebARSutz::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Create video texture on global pointer updated in AppDemoVideo
+    al.addTextureToLoad(gVideoTexture,
+                        AppCommon::texturePath + "LiveVideoError.png",
+                        GL_LINEAR,
+                        GL_LINEAR);
+
+    // Model of the site and the shader program for the video background
+    al.addNodeToLoad(_sutz,
+                     AppCommon::dataPath +
+                       "erleb-AR/models/sutz/Sutz-Kirchrain18.gltf");
+
+    al.addProgramToLoad(_spVideoBackground,
+                        AppCommon::shaderPath + "PerPixTmBackground.vert",
+                        AppCommon::shaderPath + "PerPixTmBackground.frag");
+
+    // Initialize sensor stuff before loading the geotiff
+    // Go to https://map.geo.admin.ch and choose your origin and default point
+    AppCommon::devLoc.useOriginAltitude(false);
+    AppCommon::devLoc.originLatLonAlt(47.10600, 7.21772, 434.4f);        // Corner Carport
+    AppCommon::devLoc.defaultLatLonAlt(47.10598, 7.21757, 433.9f + 1.7); // In the street
+    AppCommon::devLoc.nameLocations().push_back(SLLocation("Corner Carport, Origin", 47, 6, 21.609, 7, 13, 3.788, 434.4));
+    AppCommon::devLoc.nameLocations().push_back(SLLocation("Einfahrt (Dolendeckel)", 47, 6, 21.639, 7, 13, 2.764, 433.6 + 1.7));
+    AppCommon::devLoc.nameLocations().push_back(SLLocation("Elektrokasten, Brunnenweg", 47, 6, 21.044, 7, 13, 4.920, 438.4 + 1.7));
+    AppCommon::devLoc.nameLocations().push_back(SLLocation("Sitzbänkli am See", 47, 6, 24.537, 7, 13, 2.766, 431.2 + 1.7));
+    // The first named location replaces the origin that was set from literals above
+    AppCommon::devLoc.originLatLonAlt(AppCommon::devLoc.nameLocations()[0].posWGS84LatLonAlt);
+    AppCommon::devLoc.activeNamedLocation(1);   // This sets the location 1 as defaultENU
+    AppCommon::devLoc.locMaxDistanceM(1000.0f); // Max. distance to the origin
+    AppCommon::devLoc.improveOrigin(false);     // No automatic improvement of the origin
+    AppCommon::devLoc.hasOrigin(true);
+    AppCommon::devLoc.offsetMode(LOM_twoFingerY);
+    AppCommon::devRot.zeroYawAtStart(false);
+    AppCommon::devRot.offsetMode(ROM_oneFingerX);
+
+    // This loads the DEM file and overwrites the altitude of originLatLonAlt and defaultLatLonAlt
+    al.addGeoTiffToLoad(AppCommon::devLoc,
+                        AppCommon::dataPath +
+                          "erleb-AR/models/sutz/Sutz-Kirchrain18-DEM-WGS84.tif");
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Builds the video background materials, the sun light, the camera and the
+scene graph around the loaded model node _sutz and hands the camera to sv.*/
+void AppDemoSceneErlebARSutz::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    gVideoTexture->texType(TT_videoBkgd);
+
+    // Create see through video background material without shadow mapping
+    SLMaterial* matVideoBkgd = new SLMaterial(am, "matVideoBkgd", gVideoTexture);
+    matVideoBkgd->reflectionModel(RM_Custom);
+
+    // Create see through video background material with shadow mapping
+    SLMaterial* matVideoBkgdSM = new SLMaterial(am, "matVideoBkgdSM", gVideoTexture);
+    matVideoBkgdSM->reflectionModel(RM_Custom);
+    matVideoBkgdSM->ambient(SLCol4f(0.6f, 0.6f, 0.6f));
+    matVideoBkgdSM->getsShadows(true);
+
+    // Create directional light for the sunlight
+    SLLightDirect* sunLight = new SLLightDirect(am, this, 5.0f);
+    sunLight->powers(1.0f, 1.0f, 1.0f);
+    sunLight->attenuation(1, 0, 0);
+    sunLight->translation(0, 10, 0);
+    sunLight->lookAt(10, 0, 10);
+    sunLight->doSunPowerAdaptation(true);
+    sunLight->createsShadows(true);
+    sunLight->createShadowMap(-100, 150, SLVec2f(150, 150), SLVec2i(4096, 4096));
+    sunLight->doSmoothShadows(true);
+    sunLight->castsShadows(false);
+
+    // Let the sun be rotated by time and location
+    AppCommon::devLoc.sunLightNode(sunLight);
+    AppCommon::devLoc.calculateSolarAngles(AppCommon::devLoc.originLatLonAlt(),
+                                           std::time(nullptr));
+
+    // Setup the camera
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->translation(0, 50, -150);
+    cam1->lookAt(0, 0, 0);
+    cam1->clipNear(1);
+    cam1->clipFar(300);
+    cam1->focalDist(150);
+    cam1->setInitialState();
+    cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+    cam1->background().texture(gVideoTexture);
+
+    // Turn on main video
+    CVCapture::instance()->videoType(VT_MAIN);
+
+    // Rotate to the true geographic rotation
+    // Nothing to do here because the model is north up
+
+    // Let the video shine through the terrain. The lookup result is checked
+    // before use (assumes findChild yields nullptr when nothing matches - TODO confirm).
+    SLNode* terrain = _sutz->findChild("Terrain");
+    if (terrain)
+        terrain->setMeshMat(matVideoBkgdSM, true);
+
+    // Make buildings transparent with edges (same guard as for the terrain)
+    SLNode* buildings = _sutz->findChild("Buildings");
+    if (buildings)
+    {
+        buildings->setMeshMat(matVideoBkgd, true);
+        buildings->setDrawBitsRec(SL_DB_WITHEDGES, true);
+    }
+
+    // Add axis object at world origin
+    SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node");
+    axis->setDrawBitsRec(SL_DB_MESHWIRED, false);
+    axis->rotate(-90, 1, 0, 0);
+    axis->castsShadows(false);
+
+    SLNode* scene = new SLNode("Scene");
+    root3D(scene);
+    scene->addChild(sunLight);
+    scene->addChild(axis);
+    scene->addChild(_sutz);
+    scene->addChild(cam1);
+
+#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID)
+    // On iOS and Android the camera follows the device location and rotation
+    AppCommon::devLoc.isUsed(true);
+    AppCommon::devRot.isUsed(true);
+    cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp);
+#else
+    // Elsewhere the camera is placed at the default location relative to the
+    // origin and looks at the origin with the turntable animation
+    AppCommon::devLoc.isUsed(false);
+    AppCommon::devRot.isUsed(false);
+    SLVec3d pos_d = AppCommon::devLoc.defaultENU() - AppCommon::devLoc.originENU();
+    SLVec3f pos_f((SLfloat)pos_d.x, (SLfloat)pos_d.y, (SLfloat)pos_d.z);
+    cam1->translation(pos_f);
+    cam1->focalDist(pos_f.length());
+    cam1->lookAt(SLVec3f::ZERO);
+    cam1->camAnim(SLCamAnim::CA_turntableYUp);
+#endif
+
+    sv->doWaitOnIdle(false); // for constant video feed
+    sv->camera(cam1);
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneErlebARSutz.h b/apps/app_demo/source/scenes/AppDemoSceneErlebARSutz.h
new file mode 100644
index 00000000..ecdcc9e8
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneErlebARSutz.h
@@ 
-0,0 +1,50 @@
+/**
+ * \file AppDemoSceneErlebARSutz.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEERLEBAR_SUTZ_H
+#define APPDEMOSCENEERLEBAR_SUTZ_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for ErlebAR model in Sutz
+class AppDemoSceneErlebARSutz : public SLScene
+{
+public:
+    AppDemoSceneErlebARSutz();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+    SLNode*      _sutz              = nullptr; //!< Model node passed to the asset loader; null until loaded
+    SLGLProgram* _spVideoBackground = nullptr; //!< Shader program passed to the asset loader; null until loaded
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneFigure.cpp b/apps/app_demo/source/scenes/AppDemoSceneFigure.cpp
new file mode 100644
index 00000000..5d5c0188
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneFigure.cpp
@@ -0,0 +1,220 @@
+/**
+ * \file AppDemoSceneFigure.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+AppDemoSceneFigure::AppDemoSceneFigure() : SLScene("Hierarchical Figure Test")
+{
+    info("Hierarchical scenegraph with multiple subgroups in the figure. "
+         "The goal is design a figure with hierarchical transforms containing only rotations and translations. "
+         "You can see the hierarchy better in the Scenegraph window. In there the nodes are white and the meshes yellow. "
+         "You can view the axis aligned bounding boxes with key B and the nodes origin and axis with key X.");
+}
+//-----------------------------------------------------------------------------
+//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneFigure::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Nothing to register: assemble() below creates everything with new
+}
+//-----------------------------------------------------------------------------
+//! Assembles floor, camera, spot light and the animated figure into the scene.
+void AppDemoSceneFigure::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    // Materials: one for the figure and one for the floor
+    SLMaterial* matFigure = new SLMaterial(am,
+                                           "m1",
+                                           SLCol4f::BLACK,
+                                           SLCol4f::WHITE,
+                                           128,
+                                           0.2f,
+                                           0.8f,
+                                           1.5f);
+    SLMaterial* matFloor  = new SLMaterial(am,
+                                           "m2",
+                                           SLCol4f::WHITE * 0.3f,
+                                           SLCol4f::WHITE,
+                                           128,
+                                           0.5f,
+                                           0.0f,
+                                           1.0f);
+
+    // Floor rectangle, rotated and moved with the two calls below
+    SLuint  floorRes  = 20;
+    SLMesh* floorMesh = new SLRectangle(am,
+                                        SLVec2f(-5, -5),
+                                        SLVec2f(5, 5),
+                                        floorRes,
+                                        floorRes,
+                                        "rectangle",
+                                        matFloor);
+    SLNode* floorNode = new SLNode(floorMesh);
+    floorNode->rotate(90, -1, 0, 0);
+    floorNode->translate(0, 0, -5.5f);
+
+    // Camera with a two color background
+    SLCamera* camera = new SLCamera("Camera 1");
+    camera->translation(-7, 2, 7);
+    camera->lookAt(0, -2, 0);
+    camera->focalDist(10);
+    camera->setInitialState();
+    camera->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+    camera->background().colors(SLCol4f(0.7f, 0.6f, 1.0f), SLCol4f(0.1f, 0.4f, 0.8f));
+
+    // Spot light
+    SLLightSpot* spot = new SLLightSpot(am, this, 5, 0, 5, 0.5f);
+    spot->powers(0.2f, 0.9f, 0.9f);
+    spot->attenuation(1, 0, 0);
+
+    // The figure hierarchy including its animation
+    SLNode* figureNode = BuildFigureGroup(am, this, matFigure, true);
+
+    // Put everything below a common root node
+    SLNode* rootNode = new SLNode("scene node");
+    root3D(rootNode);
+    rootNode->addChild(spot);
+    rootNode->addChild(camera);
+    rootNode->addChild(floorNode);
+    rootNode->addChild(figureNode);
+
+    sv->camera(camera);
+}
+
+//-----------------------------------------------------------------------------
+//!
Build a hierarchical figurine with arms and legs
+/*! Creates the node hierarchy of the figure from sphere, cylinder and box
+meshes that all use the material mat. If withAnimation is true, rotation
+tracks for the left leg and both arms are added to one animation created
+through the animation manager of scene s. Returns the figure's root node.*/
+SLNode* AppDemoSceneFigure::BuildFigureGroup(SLAssetManager* am,
+                                             SLScene*        s,
+                                             SLMaterial*     mat,
+                                             SLbool          withAnimation)
+{
+    const SLuint res = 16; // resolution passed to all spheres and cylinders
+
+    // Foot: ankle sphere plus a box for the foot itself
+    SLNode* footGrp = new SLNode("feet group (T13,R6)");
+    footGrp->addMesh(new SLSphere(am, 0.2f, res, res, "ankle", mat));
+    SLNode* footBox = new SLNode(new SLBox(am, -0.2f, -0.1f, 0.0f, 0.2f, 0.1f, 0.8f, "foot mesh", mat),
+                                 "feet (T14)");
+    footBox->translate(0.0f, -0.25f, -0.15f, TS_object);
+    footGrp->addChild(footBox);
+    footGrp->translate(0.0f, 0.0f, 1.6f, TS_object);
+    footGrp->rotate(-90.0f, 1.0f, 0.0f, 0.0f);
+
+    // Lower leg: knee sphere, shin cylinder and the foot group
+    SLNode* lowLegGrp = new SLNode("low leg group (T11, R5)");
+    lowLegGrp->addMesh(new SLSphere(am, 0.3f, res, res, "knee mesh", mat));
+    SLNode* shin = new SLNode(new SLCylinder(am, 0.2f, 1.4f, 1, res, false, false, "shin mesh", mat),
+                              "shin (T12)");
+    shin->translate(0.0f, 0.0f, 0.2f, TS_object);
+    lowLegGrp->addChild(shin);
+    lowLegGrp->addChild(footGrp);
+    lowLegGrp->translate(0.0f, 0.0f, 1.27f, TS_object);
+    lowLegGrp->rotate(0, 1.0f, 0.0f, 0.0f);
+
+    // Whole leg: hip joint sphere, thigh cylinder and the lower leg group
+    SLNode* legGrp = new SLNode("leg group");
+    legGrp->addMesh(new SLSphere(am, 0.4f, res, res, "hip joint mesh", mat));
+    SLNode* thigh = new SLNode(new SLCylinder(am, 0.3f, 1.0f, 1, res, false, false, "thigh mesh", mat),
+                               "thigh (T10)");
+    thigh->translate(0.0f, 0.0f, 0.27f, TS_object);
+    legGrp->addChild(thigh);
+    legGrp->addChild(lowLegGrp);
+
+    // The left leg holds the leg group itself, the right leg a recursive copy
+    SLNode* legLeft = new SLNode("left leg group (T8)");
+    legLeft->translate(-0.4f, 0.0f, 2.2f, TS_object);
+    legLeft->addChild(legGrp);
+    SLNode* legRight = new SLNode("right leg group (T9)");
+    legRight->translate(0.4f, 0.0f, 2.2f, TS_object);
+    legRight->addChild(legGrp->copyRec());
+
+    // Lower arm: elbow sphere and lower arm cylinder
+    SLNode* lowArmGrp = new SLNode("low arm group (T6,R4)");
+    lowArmGrp->addMesh(new SLSphere(am, 0.2f, res, res, "elbow mesh", mat));
+    SLNode* lowArm = new SLNode(new SLCylinder(am, 0.15f, 1.0f, 1, res, true, false, "low arm mesh", mat),
+                                "low arm (T7)");
+    lowArm->translate(0.0f, 0.0f, 0.14f, TS_object);
+    lowArmGrp->addChild(lowArm);
+    lowArmGrp->translate(0.0f, 0.0f, 1.2f, TS_object);
+    lowArmGrp->rotate(45, -1.0f, 0.0f, 0.0f);
+
+    // Whole arm: shoulder sphere, upper arm cylinder and the lower arm group
+    SLNode* armGrp = new SLNode("arm group");
+    armGrp->addMesh(new SLSphere(am, 0.3f, res, res, "shoulder mesh", mat));
+    SLNode* upperArm = new SLNode(new SLCylinder(am, 0.2f, 1.0f, 1, res, false, false, "upper arm mesh", mat),
+                                  "upper arm (T5)");
+    upperArm->translate(0.0f, 0.0f, 0.2f, TS_object);
+    armGrp->addChild(upperArm);
+    armGrp->addChild(lowArmGrp);
+
+    // The left arm holds the arm group itself, the right arm a recursive copy
+    SLNode* armLeft = new SLNode("left arm group (T3,R2)");
+    armLeft->translate(-1.1f, 0.0f, 0.3f, TS_object);
+    armLeft->rotate(10, -1, 0, 0);
+    armLeft->addChild(armGrp);
+    SLNode* armRight = new SLNode("right arm group (T4,R3)");
+    armRight->translate(1.1f, 0.0f, 0.3f, TS_object);
+    armRight->rotate(-60, -1, 0, 0);
+    armRight->addChild(armGrp->copyRec());
+
+    // Head with two eyes that share one mesh, plus the neck
+    SLNode* head = new SLNode(new SLSphere(am, 0.5f, res, res, "head mesh", mat),
+                              "head (T1)");
+    head->translate(0.0f, 0.0f, -0.7f, TS_object);
+    SLSphere* eyeMesh  = new SLSphere(am, 0.06f, res, res, "eye mesh", mat);
+    SLNode*   eyeLeft  = new SLNode(eyeMesh, SLVec3f(-0.15f, 0.48f, 0), "eyeL (T1.1)");
+    SLNode*   eyeRight = new SLNode(eyeMesh, SLVec3f(0.15f, 0.48f, 0), "eyeR (T1.2)");
+    head->addChild(eyeLeft);
+    head->addChild(eyeRight);
+    SLNode* neck = new SLNode(new SLCylinder(am, 0.25f, 0.3f, 1, res, false, false, "neck mesh", mat),
+                              "neck (T2)");
+    neck->translate(0.0f, 0.0f, -0.3f, TS_object);
+
+    // Root of the figure: chest box first, then head, neck, arms and legs
+    SLNode* figure = new SLNode("figure group (R1)");
+    SLNode* chest  = new SLNode(new SLBox(am, -0.8f, -0.4f, 0.0f, 0.8f, 0.4f, 2.0f, "chest mesh", mat),
+                                "chest");
+    figure->addChild(chest);
+    figure->addChild(head);
+    figure->addChild(neck);
+    figure->addChild(armLeft);
+    figure->addChild(armRight);
+    figure->addChild(legLeft);
+    figure->addChild(legRight);
+    figure->rotate(90, 1, 0, 0);
+
+    if (!withAnimation)
+        return figure;
+
+    // All rotation tracks below belong to one animation; the animated nodes
+    // are looked up by name starting at the figure root
+    legLeft = figure->findChild("left leg group (T8)");
+    legLeft->rotate(30, -1, 0, 0);
+    SLAnimation* anim = s->animManager().createNodeAnimation("figure animation",
+                                                             2.0f,
+                                                             true,
+                                                             EC_inOutQuint,
+                                                             AL_pingPongLoop);
+    anim->createNodeAnimTrackForRotation(legLeft, 60, SLVec3f(1, 0, 0));
+
+    SLNode* lowLegLeft = legLeft->findChild("low leg group (T11, R5)");
+    anim->createNodeAnimTrackForRotation(lowLegLeft, 40, SLVec3f(1, 0, 0));
+
+    SLNode* footLeft = legLeft->findChild("feet group (T13,R6)");
+    anim->createNodeAnimTrackForRotation(footLeft, 40, SLVec3f(1, 0, 0));
+
+    armLeft = figure->findChild("left arm group (T3,R2)");
+    armLeft->rotate(-45, -1, 0, 0);
+    anim->createNodeAnimTrackForRotation(armLeft, -60, SLVec3f(1, 0, 0));
+
+    armRight = figure->findChild("right arm group (T4,R3)");
+    armRight->rotate(45, -1, 0, 0);
+    anim->createNodeAnimTrackForRotation(armRight, 60, SLVec3f(1, 0, 0));
+
+    return figure;
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneFigure.h b/apps/app_demo/source/scenes/AppDemoSceneFigure.h
new file mode 100644
index 00000000..db00e05f
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneFigure.h
@@ -0,0 +1,33 @@
+/**
+ * \file AppDemoSceneFigure.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEFIGURE_H
+#define APPDEMOSCENEFIGURE_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//!
Class for the figure scene
+class AppDemoSceneFigure : public SLScene
+{
+public:
+    AppDemoSceneFigure();
+
+    //! Registers the assets to load in parallel.
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! Assembles the scene graph and sets the camera of the scene view.
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+    //! Builds the node hierarchy of the figure and returns its root node.
+    static SLNode* BuildFigureGroup(SLAssetManager* am, SLScene* s, SLMaterial* mat, SLbool withAnimation);
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneFrustum.cpp b/apps/app_demo/source/scenes/AppDemoSceneFrustum.cpp
new file mode 100644
index 00000000..fa6e64b4
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneFrustum.cpp
@@ -0,0 +1,102 @@
+/**
+ * \file AppDemoSceneFrustum.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include "SL.h"
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+AppDemoSceneFrustum::AppDemoSceneFrustum()
+  : SLScene("Frustum Culling Test Scene")
+{
+    info("View frustum culling: Only objects in view are rendered. "
+         "You can turn view frustum culling on/off in the menu Preferences or with the key F.");
+}
+//-----------------------------------------------------------------------------
+//! Registers the one texture that this scene loads in parallel.
+void AppDemoSceneFrustum::registerAssetsToLoad(SLAssetLoader& al)
+{
+    al.addTextureToLoad(_tex, AppCommon::texturePath + "earth1024_C.jpg");
+}
+//-----------------------------------------------------------------------------
+//!
After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneFrustum::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    // Material that uses the texture registered for loading
+    SLMaterial* sphereMat = new SLMaterial(am, "mat1", _tex);
+
+    // Camera close to the center looking at the origin
+    SLCamera* camera = new SLCamera("Camera 1");
+    camera->clipNear(0.1f);
+    camera->clipFar(100);
+    camera->translation(0, 0, 1);
+    camera->lookAt(0, 0, 0);
+    camera->focalDist(5);
+    camera->background().colors(SLCol4f(0.1f, 0.1f, 0.1f));
+    camera->setInitialState();
+    camera->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+
+    // Spot light
+    SLLightSpot* spot = new SLLightSpot(am, this, 10, 10, 10, 0.3f);
+    spot->powers(0.2f, 0.8f, 1.0f);
+    spot->attenuation(1, 0, 0);
+
+    SLNode* rootNode = new SLNode;
+    root3D(rootNode);
+    rootNode->addChild(camera);
+    rootNode->addChild(spot);
+
+    // The sphere in the center is the template for all other spheres
+    SLuint  res    = 16;
+    SLNode* center = new SLNode(new SLSphere(am, 0.15f, res, res, "mySphere", sphereMat));
+    rootNode->addChild(center);
+
+    // Fill a cubic grid from -half to +half on all three axes with copies
+    const SLint half = 20;
+    for (SLint z = -half; z <= half; ++z)
+    {
+        for (SLint y = -half; y <= half; ++y)
+        {
+            for (SLint x = -half; x <= half; ++x)
+            {
+                // The grid center is already taken by the template sphere
+                if (x == 0 && y == 0 && z == 0)
+                    continue;
+
+                SLNode* clone = center->copyRec();
+                clone->translate(float(x), float(y), float(z), TS_object);
+                rootNode->addChild(clone);
+            }
+        }
+    }
+
+    // Number of grid positions per axis, used for the log message only
+    SLuint perAxis = (SLuint)(half + half + 1);
+
+    SL_LOG_DEBUG("Triangles on GPU : %u", res * res * 2 * perAxis * perAxis * perAxis);
+
+    sv->camera(camera);
+    sv->doWaitOnIdle(false);
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneFrustum.h b/apps/app_demo/source/scenes/AppDemoSceneFrustum.h
new file mode 100644
index 00000000..eab3a64b
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneFrustum.h
@@ -0,0 +1,49 @@
+/**
+ * \file AppDemoSceneFrustum.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App 
framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEFRUSTUM_H
+#define APPDEMOSCENEFRUSTUM_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for frustum culling test scene
+class AppDemoSceneFrustum : public SLScene
+{
+public:
+    AppDemoSceneFrustum();
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+    SLGLTexture* _tex = nullptr; //!< Texture passed to the asset loader; null until loaded
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneGLTF.cpp b/apps/app_demo/source/scenes/AppDemoSceneGLTF.cpp
new file mode 100644
index 00000000..08ae924f
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneGLTF.cpp
@@ -0,0 +1,147 @@
+/**
+ * \file AppDemoSceneGLTF.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+AppDemoSceneGLTF::AppDemoSceneGLTF(SLSceneID sceneID)
+  : SLScene("GLTF File Demo Scene"),
+    _sceneID(sceneID)
+{
+    info("GLTF File Format Test Scene");
+}
+//-----------------------------------------------------------------------------
+//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneGLTF::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // The skybox is registered first and is the same for every scene ID
+    al.addSkyboxToLoad(_skybox,
+                       al.modelPath() + "GLTF/glTF-Sample-Models/hdris/envmap_malibu.hdr",
+                       SLVec2i(256, 256),
+                       "HDR Skybox");
+
+    // Pick the model file and the scene name that belong to the scene ID.
+    // For any other ID the model file and the scene name stay unchanged.
+    if (_sceneID == SID_glTF_DamagedHelmet)
+    {
+        _modelFile = "GLTF/glTF-Sample-Models/2.0/DamagedHelmet/glTF/DamagedHelmet.gltf";
+        name("glTF-Sample-Model: Damaged Helmet");
+    }
+    else if (_sceneID == SID_glTF_FlightHelmet)
+    {
+        _modelFile = "GLTF/glTF-Sample-Models/2.0/FlightHelmet/glTF/FlightHelmet.gltf";
+        name("glTF-Sample-Model: Flight Helmet");
+    }
+    else if (_sceneID == SID_glTF_Sponza)
+    {
+        _modelFile = "GLTF/glTF-Sample-Models/2.0/Sponza/glTF/Sponza.gltf";
+        name("glTF-Sample-Model: Sponza Palace in Dubrovnic");
+    }
+    else if (_sceneID == SID_glTF_WaterBottle)
+    {
+        _modelFile = "GLTF/glTF-Sample-Models/2.0/WaterBottle/glTF/WaterBottle.gltf";
+        name("glTF-Sample-Model: WaterBottle");
+    }
+
+    // The model node is registered last with the chosen file
+    al.addNodeToLoad(_modelGLTF, AppCommon::modelPath + _modelFile);
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Places the camera depending on the scene ID, adds a directional light with
+a shadow map, assigns the skybox to the materials of the loaded model and
+hands the camera to sv.*/
+void AppDemoSceneGLTF::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    SLVec3f camPos, lookAt;
+    SLfloat camClipFar = 100;
+
+    // Camera position, look-at point and far clipping plane per model.
+    // For any other scene ID the values declared above are used as they are.
+    switch (_sceneID)
+    {
+        case SID_glTF_DamagedHelmet:
+        {
+            camPos.set(0, 0, 3);
+            lookAt.set(0, camPos.y, 0);
+            camClipFar = 20;
+            break;
+        }
+        case SID_glTF_FlightHelmet:
+        {
+            camPos.set(0, 0.33f, 1.1f);
+            lookAt.set(0, camPos.y, 0);
+            camClipFar = 20;
+            break;
+        }
+        case SID_glTF_Sponza:
+        {
+            camPos.set(-8, 1.6f, 0);
+            lookAt.set(0, camPos.y, 0);
+            break;
+        }
+        case SID_glTF_WaterBottle:
+        {
+            camPos.set(0, 0, 0.5f);
+            lookAt.set(0, camPos.y, 0);
+            camClipFar = 20;
+            break;
+        }
+    }
+
+    // Create a scene group node
+    SLNode* scene = new SLNode("scene node");
+    this->root3D(scene);
+
+    // Create camera and initialize its parameters
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->translation(camPos);
+    cam1->lookAt(lookAt);
+    cam1->background().colors(SLCol4f(0.2f, 0.2f, 0.2f));
+    cam1->focalDist(camPos.length());
+    cam1->clipFar(camClipFar);
+    cam1->setInitialState();
+    scene->addChild(cam1);
+
+    // Add directional light with a position that corresponds roughly to the sun direction
+    SLLight::gamma = 2.2f;
+    SLLightDirect* light = new SLLightDirect(am,
+                                             this,
+                                             0.55f,
+                                             1.0f,
+                                             -0.2f,
+                                             0.2f,
+                                             0,
+                                             1,
+                                             1);
+    light->lookAt(0, 0, 0);
+    light->attenuation(1, 0, 0);
+    light->createsShadows(true);
+    light->createShadowMapAutoSize(cam1,
+                                   SLVec2i(2048, 2048),
+                                   4);
+    light->shadowMap()->cascadesFactor(1.0);
+    light->doSmoothShadows(true);
+    light->castsShadows(false);
+    light->shadowMinBias(0.001f);
+    light->shadowMaxBias(0.003f);
+    scene->addChild(light);
+
+    // Update all materials and set their skybox to _skybox. The lambda only
+    // reads the member _skybox, so it captures this explicitly instead of
+    // relying on [=], whose implicit capture of this is deprecated in C++20.
+    _modelGLTF->updateMeshMat([this](SLMaterial* m)
+                              { m->skybox(_skybox); },
+                              true);
+
+    scene->addChild(_modelGLTF);
+
+    this->skybox(_skybox);
+    sv->camera(cam1);
+    sv->doWaitOnIdle(true); // Saves energy
+}
+//-----------------------------------------------------------------------------
diff --git 
a/apps/app_demo/source/scenes/AppDemoSceneGLTF.h b/apps/app_demo/source/scenes/AppDemoSceneGLTF.h
new file mode 100644
index 00000000..91e43b6b
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneGLTF.h
@@ -0,0 +1,53 @@
+/**
+ * \file AppDemoSceneGLTF.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENEGLTF_H
+#define APPDEMOSCENEGLTF_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for GLTF file loading demo scene
+class AppDemoSceneGLTF : public SLScene
+{
+public:
+    AppDemoSceneGLTF(SLSceneID sceneID);
+
+
+    //! All scene specific assets have to be registered for async loading in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there are
+    no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+
+    //! After parallel loading of the assets the scene gets assembled in here.
+    /*! @remark All scene-specific assets have to be loaded async by overriding
+    SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and
+    assembling means that it happens in a parallel thread and that in there
+    are no OpenGL calls allowed. OpenGL calls are only allowed in the main
+    thread. It is important that all object instantiations within
+    SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen
+    in a parallel thread. All objects that get rendered have to do their
+    initialization when they are used the first time during rendering in the
+    main thread.*/
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+
+private:
+    SLSceneID _sceneID;              //!< Scene ID given to the constructor; selects the model
+    SLstring  _modelFile;            //!< Model file path relative to the model directory
+    SLSkybox* _skybox    = nullptr;  //!< Skybox passed to the asset loader; null until loaded
+    SLNode*   _modelGLTF = nullptr;  //!< Model node passed to the asset loader; null until loaded
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneJansUniverse.cpp b/apps/app_demo/source/scenes/AppDemoSceneJansUniverse.cpp
new file mode 100644
index 00000000..8acba6f5
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneJansUniverse.cpp
@@ -0,0 +1,215 @@
+/**
+ * \file AppDemoSceneJansUniverse.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef SL_GLES
+const SLuint NUM_MAT_MESH = 100;
+const SLuint NUM_LEVELS   = 6;
+const SLuint NUM_CHILDREN = 8;
+#else
+const SLuint NUM_MAT_MESH = 20;
+const SLuint NUM_LEVELS   = 6;
+const SLuint NUM_CHILDREN = 8;
+#endif
+
+//-----------------------------------------------------------------------------
+AppDemoSceneJansUniverse::AppDemoSceneJansUniverse()
+  : SLScene("Jan's Universe Test Scene")
+{
+    info("Jan's Universe Test Scene");
+}
+//-----------------------------------------------------------------------------
+//!
All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneJansUniverse::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // The three maps of the rusty-metal texture set (file suffixes _C, _M, _R)
+    al.addTextureToLoad(_textureC, AppCommon::texturePath + "rusty-metal_2048_C.jpg");
+    al.addTextureToLoad(_textureM, AppCommon::texturePath + "rusty-metal_2048_M.jpg");
+    al.addTextureToLoad(_textureR, AppCommon::texturePath + "rusty-metal_2048_R.jpg");
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Creates the camera and the root node, NUM_MAT_MESH materials with evenly
+spread hues and one sphere mesh per material, and then hands them to
+generateUniverse to build the node hierarchy.
+@param am Asset manager passed to all created materials and meshes
+@param sv Scene view that receives the camera
+*/
+void AppDemoSceneJansUniverse::assemble(SLAssetManager* am,
+                                        SLSceneView*    sv)
+{
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->clipNear(0.1f);
+    cam1->clipFar(1000);
+    cam1->translation(0, 0, 75);
+    cam1->focalDist(75);
+    cam1->lookAt(0, 0, 0);
+    cam1->background().colors(SLCol4f(0.3f, 0.3f, 0.3f));
+    cam1->setInitialState();
+
+    // Root scene node
+    SLNode* scene = new SLNode;
+    root3D(scene);
+    scene->addChild(cam1);
+
+    // Generate NUM_MAT_MESH cook-torrance materials.
+    // The index is unsigned like NUM_MAT_MESH to avoid a signed/unsigned compare.
+    SLVMaterial materials(NUM_MAT_MESH);
+    for (SLuint i = 0; i < NUM_MAT_MESH; ++i)
+    {
+        SLstring matName = "mat-" + std::to_string(i);
+        materials[i]     = new SLMaterial(am,
+                                      matName.c_str(),
+                                      nullptr,
+                                      _textureC,
+                                      nullptr,
+                                      _textureM,
+                                      _textureR,
+                                      nullptr,
+                                      nullptr);
+
+        // Spread the hue evenly over the full circle
+        SLCol4f color;
+        color.hsva2rgba(SLVec4f(Utils::TWOPI * (float)i / (float)NUM_MAT_MESH, 1.0f, 1.0f));
+        materials[i]->diffuse(color);
+    }
+
+    // Generate NUM_MAT_MESH sphere meshes, one per material
+    SLVMesh meshes(NUM_MAT_MESH);
+    for (SLuint i = 0; i < NUM_MAT_MESH; ++i)
+    {
+        SLstring meshName = "mesh-" + std::to_string(i);
+        meshes[i]         = new SLSphere(am,
+                                 1.0f,
+                                 32,
+                                 32,
+                                 meshName.c_str(),
+                                 materials[i]); // i < NUM_MAT_MESH, no modulo needed
+    }
+
+    // Create universe
+    generateUniverse(am,
+                     this,
+                     scene,
+                     0,
+                     NUM_LEVELS,
+                     NUM_CHILDREN,
+                     materials,
+                     meshes);
+
+    sv->camera(cam1);
+    sv->doWaitOnIdle(false);
+}
+//-----------------------------------------------------------------------------
+//! 
Adds another level to Jan's Universe scene +void AppDemoSceneJansUniverse::addUniverseLevel(SLAssetManager* am, + SLScene* s, + SLNode* parent, + SLint parentID, + SLuint currentLevel, + SLuint levels, + SLuint childCount, + SLVMaterial& materials, + SLVMesh& meshes, + SLuint& numNodes) +{ + if (currentLevel >= levels) + return; + + const float degPerChild = 360.0f / (float)childCount; + SLuint mod = currentLevel % 3; + + float scaleFactor = 0.25f; + + for (SLuint i = 0; i < childCount; i++) + { + numNodes++; + string childName = "Node" + std::to_string(numNodes) + + "-L" + std::to_string(currentLevel) + + "-C" + std::to_string(i); + SLNode* child = new SLNode(meshes[numNodes % meshes.size()], childName); + + child->rotate((float)i * degPerChild, 0, 0, 1); + child->translate(2, 0, 0); + child->scale(scaleFactor); + + // Node animation on child node + string animName = "Anim" + std::to_string(numNodes); + SLAnimation* childAnim = s->animManager().createNodeAnimation(animName.c_str(), + 60, + true, + EC_linear, + AL_loop); + childAnim->createNodeAnimTrackForRotation360(child, {0, 0, 1}); + + parent->addChild(child); + + addUniverseLevel(am, + s, + child, + parentID, + currentLevel + 1, + levels, + childCount, + materials, + meshes, + numNodes); + } +} +//----------------------------------------------------------------------------- +//! 
Generates the Jan's Universe scene +void AppDemoSceneJansUniverse::generateUniverse(SLAssetManager* am, + SLScene* s, + SLNode* parent, + SLint parentID, + SLuint levels, + SLuint childCount, + SLVMaterial& materials, + SLVMesh& meshes) +{ + // Point light without mesh + SLLightSpot* light = new SLLightSpot(am, + s, + 0, + 0, + 0, + 1.0f, + 180, + 0, + 1000, + 1000, + true); + light->attenuation(1, 0, 0); + light->scale(10, 10, 10); + light->diffuseColor({1.0f, 1.0f, 0.5f}); + + // Node animation on light node + SLAnimation* lightAnim = s->animManager().createNodeAnimation("anim0", + 60, + true, + EC_linear, + AL_loop); + lightAnim->createNodeAnimTrackForRotation360(light, + SLVec3f(0, 1, 0)); + + parent->addChild(light); + + SLuint numNodes = 1; + + addUniverseLevel(am, + s, + light, + parentID, + 1, + levels, + childCount, + materials, + meshes, + numNodes); +} +//------------------------------------------------------------------------------ \ No newline at end of file diff --git a/apps/app_demo/source/scenes/AppDemoSceneJansUniverse.h b/apps/app_demo/source/scenes/AppDemoSceneJansUniverse.h new file mode 100644 index 00000000..3a6895db --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneJansUniverse.h @@ -0,0 +1,70 @@ +/** + * \file AppDemoSceneJansUniverse.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEJANSUNIVERSE_H +#define APPDEMOSCENEJANSUNIVERSE_H + +#include "SLGLTexture.h" +#include + +//----------------------------------------------------------------------------- +//! 
Class for the benchmark scene for Jans Universe +class AppDemoSceneJansUniverse : public SLScene +{ +public: + AppDemoSceneJansUniverse(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + void addUniverseLevel(SLAssetManager* am, + SLScene* s, + SLNode* parent, + SLint parentID, + SLuint currentLevel, + SLuint levels, + SLuint childCount, + SLVMaterial& materials, + SLVMesh& meshes, + SLuint& numNodes); + void generateUniverse(SLAssetManager* am, + SLScene* s, + SLNode* parent, + SLint parentID, + SLuint levels, + SLuint childCount, + SLVMaterial& materials, + SLVMesh& meshes); + +private: + SLGLTexture* _textureC; + SLGLTexture* _textureM; + SLGLTexture* _textureR; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneLargeModel.cpp b/apps/app_demo/source/scenes/AppDemoSceneLargeModel.cpp new file mode 100644 index 00000000..84f7571a --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneLargeModel.cpp @@ -0,0 +1,79 @@ +/** + * \file AppDemoSceneLargeModel.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneLargeModel::AppDemoSceneLargeModel() + : SLScene("Large Model Benchmark Scene") +{ + info("Large Model with 7.2 mio. triangles."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneLargeModel::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_dragonModel, + AppCommon::modelPath + + "PLY/xyzrgb_dragon/xyzrgb_dragon.ply", + nullptr, + false, + true, + nullptr, + 0.2f, + false, + SLProcess_Triangulate | + SLProcess_JoinIdenticalVertices); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneLargeModel::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 220); + cam1->lookAt(0, 0, 0); + cam1->clipNear(1); + cam1->clipFar(10000); + cam1->focalDist(220); + cam1->background().colors(SLCol4f(0.7f, 0.7f, 0.7f), + SLCol4f(0.2f, 0.2f, 0.2f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + // Material for glass + SLMaterial* diffuseMat = new SLMaterial(am, + "diffuseMat", + SLCol4f::WHITE, + SLCol4f::WHITE); + _dragonModel->mesh()->mat(diffuseMat); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 200, + 200, + 200, + 1); + light1->powers(0.1f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(light1); + scene->addChild(_dragonModel); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneLargeModel.h b/apps/app_demo/source/scenes/AppDemoSceneLargeModel.h new file mode 100644 index 00000000..257c0bb6 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneLargeModel.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneLargeModel.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + 
* \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENELARGEMODEL_H +#define APPDEMOSCENELARGEMODEL_H + +#include + +//----------------------------------------------------------------------------- +//! Class for large model scene +class AppDemoSceneLargeModel : public SLScene +{ +public: + AppDemoSceneLargeModel(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _dragonModel; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneLevelOfDetail.cpp b/apps/app_demo/source/scenes/AppDemoSceneLevelOfDetail.cpp new file mode 100644 index 00000000..470f106d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneLevelOfDetail.cpp @@ -0,0 +1,185 @@ +/** + * \file AppDemoSceneLevelOfDetail.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marcus Hudritsch + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneLevelOfDetail::AppDemoSceneLevelOfDetail(SLSceneID sceneID) + : SLScene("Level of Detail Test"), + _sceneID(sceneID) +{ +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneLevelOfDetail::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texFloorDif, + AppCommon::modelPath + + "GLTF/CorinthianColumn/PavementSlateSquare2_2K_DIF.jpg", + SL_ANISOTROPY_MAX); + al.addTextureToLoad(_texFloorNrm, + AppCommon::modelPath + + "GLTF/CorinthianColumn/PavementSlateSquare2_2K_NRM.jpg", + SL_ANISOTROPY_MAX); + al.addNodeToLoad(_columnLOD, + AppCommon::modelPath + + "GLTF/CorinthianColumn/Corinthian-Column-Round-LOD.gltf"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneLevelOfDetail::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLchar name[512]; + SLint size; + if (_sceneID == SID_Benchmark_ColumnsNoLOD) + { + size = 25; + snprintf(name, + sizeof(name), + "%d corinthian columns without LOD", + size * size); + this->name(name); + } + else + { + size = 50; + snprintf(name, + sizeof(name), + "%d corinthian columns with LOD", + size * size); + this->name(name); + } + info(this->name() + " with cascaded shadow mapping. 
In the Day-Time dialogue you can change the sun angle."); + + // Create ground material + SLMaterial* matFloor = new SLMaterial(am, + "matFloor", + _texFloorDif, + _texFloorNrm); + + // Define camera + SLCamera* cam1 = new SLCamera; + cam1->translation(0, 1.7f, 20); + cam1->lookAt(0, 1.7f, 0); + cam1->focalDist(cam1->translationOS().length()); + cam1->clipFar(600); + cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f)); + cam1->setInitialState(); + + // Create directional light for the sunlight + SLLightDirect* sunLight = new SLLightDirect(am, this, 1.0f); + sunLight->powers(0.25f, 1.0f, 1.0f); + sunLight->attenuation(1, 0, 0); + sunLight->translation(0, 1.7f, 0); + sunLight->lookAt(-1, 0, -1); + sunLight->doSunPowerAdaptation(true); + + // Add cascaded shadow mapping + sunLight->createsShadows(true); + sunLight->createShadowMapAutoSize(cam1); + sunLight->doSmoothShadows(true); + sunLight->castsShadows(false); + sunLight->shadowMinBias(0.003f); + sunLight->shadowMaxBias(0.012f); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(sunLight); + AppCommon::devLoc.originLatLonAlt(47.14271, + 7.24337, + 488.2); // Ecke Giosa + AppCommon::devLoc.defaultLatLonAlt(47.14260, + 7.24310, + 488.7 + 1.7); // auf Parkplatz + + // Floor rectangle + SLNode* rect = new SLNode(new SLRectangle(am, + SLVec2f(-200, -200), + SLVec2f(200, 200), + SLVec2f(0, 0), + SLVec2f(50, 50), + 50, + 50, + "Floor", + matFloor)); + rect->rotate(90, -1, 0, 0); + rect->castsShadows(false); + + // Configure the corinthian column + SLNode* columnL0 = _columnLOD->findChild("Corinthian-Column-Round-L0"); + SLNode* columnL1 = _columnLOD->findChild("Corinthian-Column-Round-L1"); + SLNode* columnL2 = _columnLOD->findChild("Corinthian-Column-Round-L2"); + SLNode* columnL3 = _columnLOD->findChild("Corinthian-Column-Round-L3"); + + // Assemble scene + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(sunLight); + scene->addChild(rect); + scene->addChild(cam1); 
+ + // create loads of pillars + SLint numColumns = size * size; + SLfloat offset = 5.0f; + SLfloat z = (float)(size - 1) * offset * 0.5f; + + for (SLint iZ = 0; iZ < size; ++iZ) + { + SLfloat x = -(float)(size - 1) * offset * 0.5f; + + for (SLint iX = 0; iX < size; ++iX) + { + SLint iZX = iZ * size + iX; + + if (_sceneID == SID_Benchmark_ColumnsNoLOD) + { + // Without just the level 0 node + string strNode = "Node" + std::to_string(iZX); + SLNode* column = new SLNode(columnL1->mesh(), + strNode + "-L0"); + column->translate(x, 0, z, TS_object); + scene->addChild(column); + } + else + { + // With LOD parent node and 3 levels + string strLOD = "LOD" + std::to_string(iZX); + SLNodeLOD* lod_group = new SLNodeLOD(strLOD); + lod_group->translate(x, 0, z, TS_object); + lod_group->addChildLOD(new SLNode(columnL1->mesh(), + strLOD + "-L0"), + 0.1f, + 3); + lod_group->addChildLOD(new SLNode(columnL2->mesh(), + strLOD + "-L1"), + 0.01f, + 3); + lod_group->addChildLOD(new SLNode(columnL3->mesh(), + strLOD + "-L2"), + 0.0001f, + 3); + scene->addChild(lod_group); + } + x += offset; + } + z -= offset; + } + + // Set active camera & the root pointer + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneLevelOfDetail.h b/apps/app_demo/source/scenes/AppDemoSceneLevelOfDetail.h new file mode 100644 index 00000000..97594e74 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneLevelOfDetail.h @@ -0,0 +1,52 @@ +/** + * \file AppDemoSceneLevelOfDetail.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENELEVELOFDETAIL_H +#define APPDEMOSCENELEVELOFDETAIL_H + +#include + +//----------------------------------------------------------------------------- +//! Class for level of detail test scene +class AppDemoSceneLevelOfDetail : public SLScene +{ +public: + AppDemoSceneLevelOfDetail(SLSceneID sceneID); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLSceneID _sceneID; + SLGLTexture* _texFloorDif; + SLGLTexture* _texFloorNrm; + SLNode* _columnLOD; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneLotsOfNodes.cpp b/apps/app_demo/source/scenes/AppDemoSceneLotsOfNodes.cpp new file mode 100644 index 00000000..c01cb4bb --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneLotsOfNodes.cpp @@ -0,0 +1,110 @@ +/** + * \file AppDemoSceneLotsOfNodes.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneLotsOfNodes::AppDemoSceneLotsOfNodes() + : SLScene("Lots of Nodes Benchmark Scene") +{ + info("Lots of Nodes Benchmark Scene"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneLotsOfNodes::registerAssetsToLoad(SLAssetLoader& al) +{ + for (int i = 0; i < _NUM_MAT; ++i) + { + al.addTextureToLoad(_texC[i], + AppCommon::texturePath + "earth2048_C_Q95.jpg"); + } +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneLotsOfNodes::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.1f); + cam1->clipFar(100); + cam1->translation(0, 0, 50); + cam1->lookAt(0, 0, 0); + cam1->focalDist(50); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 15, + 15, + 15, + 0.3f); + light1->powers(0.2f, 0.8f, 1.0f); + light1->attenuation(1, 0, 0); + + SLNode* scene = new SLNode; + root3D(scene); + scene->addChild(cam1); + scene->addChild(light1); + + // Generate NUM_MAT materials + SLVMaterial mat; + for (int i = 0; i < _NUM_MAT; ++i) + { + SLstring matName = "mat-" + std::to_string(i); + mat.push_back(new SLMaterial(am, matName.c_str(), _texC[i])); + SLCol4f color; + color.hsva2rgba(SLVec4f(Utils::TWOPI * (float)i / (float)_NUM_MAT, + 1.0f, + 1.0f)); + mat[i]->diffuse(color); + } + + // create a 3D array of spheres + SLint halfSize = 10; + SLuint n = 0; + for (SLint iZ = -halfSize; iZ <= halfSize; ++iZ) + { + for (SLint iY = -halfSize; iY <= halfSize; ++iY) + { + for (SLint iX = -halfSize; iX <= halfSize; ++iX) + { + // Choose a random material index + SLuint res = 36; + SLint iMat = (SLint)Utils::random(0, _NUM_MAT - 1); + SLstring nodeName = "earth-" + std::to_string(n); + + // Create a new sphere and node and translate it + SLSphere* earth = new SLSphere(am, + 0.3f, + res, + res, + nodeName, + mat[iMat]); + SLNode* sphere = new SLNode(earth); + sphere->translate(float(iX), + float(iY), + float(iZ), + TS_object); + scene->addChild(sphere); + n++; + } + } + } + + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneLotsOfNodes.h b/apps/app_demo/source/scenes/AppDemoSceneLotsOfNodes.h new file mode 100644 index 00000000..d60eda9f --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneLotsOfNodes.h @@ 
-0,0 +1,51 @@ +/** + * \file AppDemoSceneLotsOfNodes.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENELOTSOFNODES_H +#define APPDEMOSCENELOTSOFNODES_H + +#include +#include + +//----------------------------------------------------------------------------- +//! Class for scene with lots of nodes +class AppDemoSceneLotsOfNodes : public SLScene +{ +public: + AppDemoSceneLotsOfNodes(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + static const int _NUM_MAT = 20; + SLGLTexture* _texC[_NUM_MAT]; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneMeshLoad.cpp b/apps/app_demo/source/scenes/AppDemoSceneMeshLoad.cpp new file mode 100644 index 00000000..6ab6a2e3 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneMeshLoad.cpp @@ -0,0 +1,133 @@ +/** + * \file AppDemoSceneMeshLoad.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneMeshLoad::AppDemoSceneMeshLoad() : SLScene("Mesh 3D Loader Test") +{ + info("We use the assimp library to load 3D file formats including materials, skeletons and animations. " + "You can view the skeleton with key K. You can stop all animations with SPACE key.\n" + "Switch between perspective and orthographic projection with key 5. " + "Switch to front view with key 1, to side view with key 3 and to top view with key 7.\n" + "Try the different stereo rendering modes in the menu Camera."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneMeshLoad::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_mesh3DS, + AppCommon::modelPath + + "3DS/Halloween/jackolan.3ds"); + al.addNodeToLoad(_meshFBX, + AppCommon::modelPath + + "FBX/Duck/duck.fbx"); + al.addNodeToLoad(_meshDAE, + AppCommon::modelPath + + "DAE/AstroBoy/AstroBoy.dae"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneMeshLoad::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLMaterial* matBlu = new SLMaterial(am, "Blue", SLCol4f(0, 0, 0.2f), SLCol4f(1, 1, 1), 100, 0.8f, 0); + SLMaterial* matRed = new SLMaterial(am, "Red", SLCol4f(0.2f, 0, 0), SLCol4f(1, 1, 1), 100, 0.8f, 0); + SLMaterial* matGre = new SLMaterial(am, "Green", SLCol4f(0, 0.2f, 0), SLCol4f(1, 1, 1), 100, 0.8f, 0); + SLMaterial* matGra = new SLMaterial(am, "Gray", SLCol4f(0.3f, 0.3f, 0.3f), SLCol4f(1, 1, 1), 100, 0, 0); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(.1f); + cam1->clipFar(30); + cam1->translation(0, 0, 12); + cam1->lookAt(0, 0, 0); + cam1->focalDist(12); + cam1->stereoEyeSeparation(cam1->focalDist() / 30.0f); + cam1->background().colors(SLCol4f(0.6f, 0.6f, 0.6f), SLCol4f(0.3f, 0.3f, 0.3f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLLightSpot* light1 = new SLLightSpot(am, this, 2.5f, 2.5f, 2.5f, 0.2f); + light1->powers(0.1f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + SLAnimation* anim = this->animManager().createNodeAnimation("anim_light1_backforth", 2.0f, true, EC_inOutQuad, AL_pingPongLoop); + anim->createNodeAnimTrackForTranslation(light1, SLVec3f(0.0f, 0.0f, -5.0f)); + + SLLightSpot* light2 = new SLLightSpot(am, this, -2.5f, -2.5f, 2.5f, 0.2f); + light2->powers(0.1f, 1.0f, 1.0f); + light2->attenuation(1, 0, 0); + anim = this->animManager().createNodeAnimation("anim_light2_updown", 2.0f, true, EC_inOutQuint, AL_pingPongLoop); + 
anim->createNodeAnimTrackForTranslation(light2, SLVec3f(0.0f, 5.0f, 0.0f)); + + // Start animation + SLAnimPlayback* charAnim = this->animManager().animPlaybacksBack(); + charAnim->playForward(); + charAnim->playbackRate(0.8f); + + // Scale to so that the AstroBoy is about 2 (meters) high. + if (_mesh3DS) + { + _mesh3DS->scale(0.1f); + _mesh3DS->translate(-22.0f, 1.9f, 3.5f, TS_object); + } + if (_meshDAE) + { + _meshDAE->translate(0, -3, 0, TS_object); + _meshDAE->scale(2.7f); + } + if (_meshFBX) + { + _meshFBX->scale(0.1f); + _meshFBX->scale(0.1f); + _meshFBX->translate(200, 30, -30, TS_object); + _meshFBX->rotate(-90, 0, 1, 0); + } + + // define rectangles for the surrounding box + SLfloat b = 3; // edge size of rectangles + SLNode *rb, *rl, *rr, *rf, *rt; + SLuint res = 20; + rb = new SLNode(new SLRectangle(am, SLVec2f(-b, -b), SLVec2f(b, b), res, res, "rectB", matBlu), "rectBNode"); + rb->translate(0, 0, -b, TS_object); + rl = new SLNode(new SLRectangle(am, SLVec2f(-b, -b), SLVec2f(b, b), res, res, "rectL", matRed), "rectLNode"); + rl->rotate(90, 0, 1, 0); + rl->translate(0, 0, -b, TS_object); + rr = new SLNode(new SLRectangle(am, SLVec2f(-b, -b), SLVec2f(b, b), res, res, "rectR", matGre), "rectRNode"); + rr->rotate(-90, 0, 1, 0); + rr->translate(0, 0, -b, TS_object); + rf = new SLNode(new SLRectangle(am, SLVec2f(-b, -b), SLVec2f(b, b), res, res, "rectF", matGra), "rectFNode"); + rf->rotate(-90, 1, 0, 0); + rf->translate(0, 0, -b, TS_object); + rt = new SLNode(new SLRectangle(am, SLVec2f(-b, -b), SLVec2f(b, b), res, res, "rectT", matGra), "rectTNode"); + rt->rotate(90, 1, 0, 0); + rt->translate(0, 0, -b, TS_object); + + SLNode* scene = new SLNode("Scene"); + this->root3D(scene); + scene->addChild(light1); + scene->addChild(light2); + scene->addChild(rb); + scene->addChild(rl); + scene->addChild(rr); + scene->addChild(rf); + scene->addChild(rt); + if (_mesh3DS) scene->addChild(_mesh3DS); + if (_meshFBX) scene->addChild(_meshFBX); + if (_meshDAE) 
scene->addChild(_meshDAE); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneMeshLoad.h b/apps/app_demo/source/scenes/AppDemoSceneMeshLoad.h new file mode 100644 index 00000000..b31409fe --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneMeshLoad.h @@ -0,0 +1,51 @@ +/** + * \file AppDemoSceneMeshLoad.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEMESHLOAD_H +#define APPDEMOSCENEMESHLOAD_H + +#include + +//----------------------------------------------------------------------------- +//! Class for mesh loading demo scene +class AppDemoSceneMeshLoad : public SLScene +{ +public: + AppDemoSceneMeshLoad(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. 
OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _mesh3DS; + SLNode* _meshFBX; + SLNode* _meshDAE; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneMinimal.cpp b/apps/app_demo/source/scenes/AppDemoSceneMinimal.cpp new file mode 100644 index 00000000..16f27583 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneMinimal.cpp @@ -0,0 +1,71 @@ +/** + * \file AppDemoSceneMinimal.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneMinimal::AppDemoSceneMinimal() : SLScene("Minimal Scene") +{ + info("Minimal scene with a texture mapped rectangle with a point light source.\n" + "You can find all other test scenes in the menu File > Load Test Scenes." + "You can jump to the next scene with the Shift-Alt-CursorRight.\n" + "You can open various info windows under the menu Infos. 
You can drag, dock and stack them on all sides.\n" + "You can rotate the scene with click and drag on the left mouse button (LMB).\n" + "You can zoom in/out with the mousewheel. You can pan with click and drag on the middle mouse button (MMB).\n"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneMinimal::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "earth2048_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneMinimal::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create materials + SLMaterial* m1 = new SLMaterial(am, "m1", _texC); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->translation(0, 0, 5); + light1->name("light node"); + scene->addChild(light1); + + // Create meshes and nodes + SLMesh* rectMesh = new SLRectangle(am, + SLVec2f(-5, -5), + SLVec2f(5, 5), + 25, + 25, + "rectangle mesh", + m1); + SLNode* rectNode = new SLNode(rectMesh, "rectangle node"); + scene->addChild(rectNode); + + // Set background color and the root scene node + sv->sceneViewCamera()->background().colors(SLCol4f(0.7f, 0.7f, 0.7f), + SLCol4f(0.2f, 0.2f, 0.2f)); + // Save energy + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneMinimal.h b/apps/app_demo/source/scenes/AppDemoSceneMinimal.h new file mode 100644 index 00000000..3338d529 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneMinimal.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneMinimal.h + * \brief Class declaration for an SLScene inherited 
class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEMINIMAL_H +#define APPDEMOSCENEMINIMAL_H + +#include + +//----------------------------------------------------------------------------- +//! Class for minimal scene +class AppDemoSceneMinimal : public SLScene +{ +public: + AppDemoSceneMinimal(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleComplexFire.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleComplexFire.cpp new file mode 100644 index 00000000..d6a43629 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleComplexFire.cpp @@ -0,0 +1,579 @@ +/** + * \file AppDemoSceneParticleComplexFire.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleComplexFire::AppDemoSceneParticleComplexFire(SLSceneID sceneID) + : SLScene("Complex Fire Particle System"), + _sceneID(sceneID) +{ + info("The fire particle systems contain each multiple sub particle systems.\n" + "See the scenegraph window for the sub particles systems. " + "See the properties window for the settings of the particles systems"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneParticleComplexFire::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texFireCld, + AppCommon::texturePath + + "ParticleFirecloudTransparent_C.png"); + al.addTextureToLoad(_texFireFlm, + AppCommon::texturePath + + "ParticleFlames_06_8x8_C.png"); + al.addTextureToLoad(_texCircle, + AppCommon::texturePath + + "ParticleCircle_05_C.png"); + al.addTextureToLoad(_texSmokeB, + AppCommon::texturePath + + "ParticleCloudBlack_C.png"); + al.addTextureToLoad(_texSmokeW, + AppCommon::texturePath + + "ParticleCloudWhite_C.png"); + al.addTextureToLoad(_texTorchFlm, + AppCommon::texturePath + + "ParticleFlames_04_16x4_C.png"); + al.addTextureToLoad(_texTorchSmk, + AppCommon::texturePath + + "ParticleSmoke_08_C.png"); + + al.addTextureToLoad(_texWallDIF, + AppCommon::texturePath + + "BrickLimestoneGray_1K_DIF.jpg", + SL_ANISOTROPY_MAX, + GL_LINEAR); + al.addTextureToLoad(_texWallNRM, + AppCommon::texturePath + + "BrickLimestoneGray_1K_NRM.jpg", + SL_ANISOTROPY_MAX, + GL_LINEAR); + + al.addNodeToLoad(_firewood, + AppCommon::modelPath + + "GLTF/Firewood/Firewood1.gltf", + nullptr, + false, + true, + nullptr, + 0.3f, + true); + al.addNodeToLoad(_torchL, + AppCommon::modelPath + + "GLTF/Torch/Torch.gltf", + nullptr, + false, + true, + nullptr, + 0.3f, + true); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneParticleComplexFire::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + if (_sceneID == SID_ParticleSystem_ComplexFire) + { + // Create a scene group node + SLNode* scene = new SLNode("Root Scene Node"); + root3D(scene); + + // Create and add camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 1.2f, 4.0f); + cam1->lookAt(0, 1.2f, 0); + cam1->focalDist(4.5f); + cam1->setInitialState(); + scene->addChild(cam1); + sv->camera(cam1); + + SLNode* complexFire = createComplexFire(am, + this, + true, + _texTorchSmk, + _texFireFlm, + 8, + 8, + _texCircle, + _texSmokeB, + _texSmokeW); + scene->addChild(complexFire); + + // Room parent node + SLNode* room = new SLNode("Room"); + scene->addChild(room); + + // Back wall material + SLMaterial* matWall = new SLMaterial(am, + "mat3", + _texWallDIF, + _texWallNRM); + matWall->specular(SLCol4f::BLACK); + matWall->metalness(0); + matWall->roughness(1); + matWall->reflectionModel(RM_CookTorrance); + + // Room dimensions + SLfloat pL = -2.0f, pR = 2.0f; // left/right + SLfloat pB = -0.01f, pT = 4.0f; // bottom/top + SLfloat pN = 2.0f, pF = -2.0f; // near/far + + // bottom rectangle + SLNode* b = new SLNode(new SLRectangle(am, + SLVec2f(pL, -pN), + SLVec2f(pR, -pF), + 10, + 10, + "Floor", + matWall)); + b->rotate(90, -1, 0, 0); + b->translate(0, 0, pB, TS_object); + room->addChild(b); + + // far rectangle + SLNode* f = new SLNode(new SLRectangle(am, + SLVec2f(pL, pB), + SLVec2f(pR, pT), + 10, + 10, + "Wall far", + matWall)); + f->translate(0, 0, pF, TS_object); + room->addChild(f); + + // near rectangle + SLNode* n = new SLNode(new SLRectangle(am, + SLVec2f(pL, pB), + SLVec2f(pR, pT), + 10, + 10, + "Wall near", + matWall)); + n->rotate(180, 0, 1, 0); + n->translate(0, 0, pF, TS_object); + room->addChild(n); + + // left rectangle + SLNode* l = new SLNode(new SLRectangle(am, + SLVec2f(-pN, pB), + SLVec2f(-pF, pT), + 10, + 10, + "Wall left", + matWall)); + l->rotate(90, 0, 1, 0); + l->translate(0, 
0, pL, TS_object); + room->addChild(l); + + // right rectangle + SLNode* r = new SLNode(new SLRectangle(am, + SLVec2f(pF, pB), + SLVec2f(pN, pT), + 10, + 10, + "Wall right", + matWall)); + r->rotate(90, 0, -1, 0); + r->translate(0, 0, -pR, TS_object); + room->addChild(r); + + // Firewood + _firewood->scale(2); + scene->addChild(_firewood); + + // Torch + _torchL->name("Torch Left"); + SLNode* torchR = _torchL->copyRec(); + torchR->name("Torch Right"); + _torchL->translate(-2, 1.5f, 0); + _torchL->rotate(90, 0, 1, 0); + _torchL->scale(2); + scene->addChild(_torchL); + torchR->translate(2, 1.5f, 0); + torchR->rotate(-90, 0, 1, 0); + torchR->scale(2); + scene->addChild(torchR); + + // Torch flame left + SLNode* torchFlameNodeL = createTorchFire(am, + this, + true, + _texTorchSmk, + _texTorchFlm, + 16, + 4); + torchFlameNodeL->translate(-1.6f, 2.25f, 0); + torchFlameNodeL->name("Torch Fire Left"); + scene->addChild(torchFlameNodeL); + + // Torch flame right + SLNode* torchFlameNodeR = createTorchFire(am, + this, + true, + _texTorchSmk, + _texTorchFlm, + 16, + 4); + torchFlameNodeR->translate(1.6f, 2.25f, 0); + torchFlameNodeR->name("Torch Fire Right"); + scene->addChild(torchFlameNodeR); + + // Set background color and the root scene node + sv->sceneViewCamera()->background().colors(SLCol4f(0.8f, 0.8f, 0.8f), + SLCol4f(0.2f, 0.2f, 0.2f)); + // Save energy + sv->doWaitOnIdle(false); + } + else if (_sceneID == SID_Benchmark_ParticleSystemComplexFire) + { + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.1f); + cam1->clipFar(1000); + cam1->translation(0, 10, 40); + cam1->focalDist(100); + cam1->lookAt(0, 0, 0); + cam1->background().colors(SLCol4f(0.3f, 0.3f, 0.3f)); + cam1->setInitialState(); + + // Root scene node + SLNode* root = new SLNode("Root Scene Node"); + root3D(root); + root->addChild(cam1); + const int NUM_NODES = 250; + + SLVNode nodes(NUM_NODES); + for (int i = 0; i < NUM_NODES; ++i) + { + SLNode* fireComplex = createComplexFire(am, + this, + 
false, + _texFireCld, + _texFireFlm, + 8, + 4, + _texCircle, + _texSmokeB, + _texSmokeW); + fireComplex->translate(-20.0f + (float)(i % 20) * 2, + 0.0f, + -(float)((i - (float)(i % 20)) / 20) * 4, + TS_object); + root->addChild(fireComplex); + } + + sv->camera(cam1); + sv->doWaitOnIdle(false); + } + else + SL_EXIT_MSG("Assemble: Should not get here!"); +} +//----------------------------------------------------------------------------- +//! Creates a complex fire group node +SLNode* AppDemoSceneParticleComplexFire::createComplexFire(SLAssetManager* am, + SLScene* s, + SLbool withLight, + SLGLTexture* texFireCld, + SLGLTexture* texFireFlm, + SLint flipbookCols, + SLint flipbookRows, + SLGLTexture* texCircle, + SLGLTexture* texSmokeB, + SLGLTexture* texSmokeW) +{ + SLNode* fireComplex = new SLNode("Fire complex node"); + + // Fire light node + if (withLight) + { + SLLightSpot* light1 = new SLLightSpot(am, + s, + 0.1f, + 180.0f, + false); + light1->name("Fire light node"); + light1->translate(0, 1.0f, 0); + light1->diffuseColor(SLCol4f(1, 0.7f, 0.2f)); + light1->diffusePower(15); + light1->attenuation(0, 0, 1); + fireComplex->addChild(light1); + } + + // Fire glow mesh + { + SLParticleSystem* glow = new SLParticleSystem(am, + 24, + SLVec3f(-0.1f, 0.0f, -0.1f), + SLVec3f(0.1f, 0.0f, 0.1f), + 4.0f, + texFireCld, + "Fire glow PS"); + glow->timeToLive(2.0f); + glow->billboardType(BT_Camera); + glow->radiusW(0.4f); + glow->radiusH(0.4f); + glow->doShape(false); + glow->doRotation(true); + glow->doRotRange(true); + glow->doSizeOverLT(false); + glow->doAlphaOverLT(false); + glow->doColorOverLT(false); + glow->doBlendBrightness(true); + glow->color(SLCol4f(0.925f, 0.5f, 0.097f, 0.199f)); + glow->doAcceleration(false); + SLNode* flameGlowNode = new SLNode(glow, "Fire glow node"); + flameGlowNode->translate(0, 0.15f, 0); + fireComplex->addChild(flameGlowNode); + } + + // Fire flame mesh + { + SLParticleSystem* flame = new SLParticleSystem(am, + 1, + SLVec3f(0.0f, 0.0f, 0.0f), + 
SLVec3f(0.0f, 0.0f, 0.0f), + + 1.0f, + texFireCld, + "Fire flame PS", + texFireFlm); + // Fire flame flipbook settings + flame->flipbookColumns(flipbookCols); + flame->flipbookRows(flipbookRows); + flame->doFlipBookTexture(true); + flame->doCounterGap(false); // We don't want to have flickering + + flame->doAlphaOverLT(false); + flame->doSizeOverLT(false); + flame->doRotation(false); + + flame->frameRateFB(64); + flame->radiusW(0.6f); + flame->radiusH(0.6f); + flame->scale(1.2f); + flame->billboardType(BT_Vertical); + + // Fire flame color + flame->doColor(true); + flame->color(SLCol4f(0.52f, 0.47f, 0.32f, 1.0f)); + flame->doBlendBrightness(true); + + // Fire flame size + flame->doSizeOverLTCurve(true); + float sizeCPArrayFl[4] = {0.0f, 1.25f, 0.0f, 1.0f}; + flame->bezierControlPointSize(sizeCPArrayFl); + float sizeSEArrayFl[4] = {0.0f, 1.0f, 1.0f, 1.0f}; + flame->bezierStartEndPointSize(sizeSEArrayFl); + flame->generateBernsteinPSize(); + + // Fire flame node + SLNode* fireFlameNode = new SLNode(flame, "Fire flame node"); + fireFlameNode->translate(0.0f, 0.7f, 0.0f, TS_object); + fireComplex->addChild(fireFlameNode); + } + + // Fire smoke black mesh + { + SLParticleSystem* smokeB = new SLParticleSystem(am, + 8, + SLVec3f(0.0f, 1.0f, 0.0f), + SLVec3f(0.0f, 0.7f, 0.0f), + 2.0f, + texSmokeB, + "Fire smoke black PS"); + smokeB->doColor(false); + + // Fire smoke black size + smokeB->doSizeOverLT(true); + smokeB->doSizeOverLTCurve(true); + float sizeCPArraySB[4] = {0.0f, 1.0f, 1.0f, 2.0f}; + smokeB->bezierControlPointSize(sizeCPArraySB); + float sizeSEArraySB[4] = {0.0f, 1.0f, 1.0f, 2.0f}; + smokeB->bezierStartEndPointSize(sizeSEArraySB); + smokeB->generateBernsteinPSize(); + + // Fire smoke black alpha + smokeB->doAlphaOverLT(true); + smokeB->doAlphaOverLTCurve(true); + float alphaCPArraySB[4] = {0.0f, 0.4f, 1.0f, 0.4f}; + smokeB->bezierControlPointAlpha(alphaCPArraySB); + float alphaSEArraySB[4] = {0.0f, 0.0f, 1.0f, 0.0f}; + 
smokeB->bezierStartEndPointAlpha(alphaSEArraySB); + smokeB->generateBernsteinPAlpha(); + + // Fire smoke black acceleration + smokeB->doAcceleration(true); + smokeB->doAccDiffDir(true); + smokeB->acceleration(0.0f, 0.25f, 0.3f); + + SLNode* fireSmokeBlackNode = new SLNode(smokeB, "Fire smoke black node"); + fireSmokeBlackNode->translate(0.0f, 0.9f, 0.0f, TS_object); + fireComplex->addChild(fireSmokeBlackNode); + } + + // Fire smoke white mesh + { + SLParticleSystem* smokeW = new SLParticleSystem(am, + 40, + SLVec3f(0.0f, 0.8f, 0.0f), + SLVec3f(0.0f, 0.6f, 0.0f), + 4.0f, + texSmokeW, + "Fire smoke white PS"); + + smokeW->doColor(false); + + // Size + smokeW->doSizeOverLT(true); + smokeW->doSizeOverLTCurve(true); + float sizeCPArraySW[4] = {0.0f, 0.5f, 1.0f, 2.0f}; + smokeW->bezierControlPointSize(sizeCPArraySW); + float sizeSEArraySW[4] = {0.0f, 0.5f, 1.0f, 2.0f}; + smokeW->bezierStartEndPointSize(sizeSEArraySW); + smokeW->generateBernsteinPSize(); + + // Alpha + smokeW->doAlphaOverLT(true); + smokeW->doAlphaOverLTCurve(true); + float alphaCPArraySW[4] = {0.0f, 0.018f, 1.0f, 0.018f}; + smokeW->bezierControlPointAlpha(alphaCPArraySW); + float alphaSEArraySW[4] = {0.0f, 0.0f, 1.0f, 0.0f}; + smokeW->bezierStartEndPointAlpha(alphaSEArraySW); + smokeW->generateBernsteinPAlpha(); + + // Acceleration + smokeW->doAcceleration(true); + smokeW->doAccDiffDir(true); + smokeW->acceleration(0.0f, 0.25f, 0.3f); + + SLNode* fireSmokeWNode = new SLNode(smokeW, "Fire smoke white node"); + fireSmokeWNode->translate(0.0f, 0.9f, 0.0f, TS_object); + fireComplex->addChild(fireSmokeWNode); + } + + // Fire sparks rising mesh + { + SLParticleSystem* fireSparksRising = new SLParticleSystem(am, + 30, + SLVec3f(-0.5f, 1, -0.5f), + SLVec3f(0.5f, 2, 0.5f), + 1.2f, + texCircle, + "Fire sparks rising PS", + nullptr, + new SLTexColorLUT(am, + CLUT_WYR, + 256)); + fireSparksRising->scale(0.05f); + fireSparksRising->radiusH(0.8f); + fireSparksRising->radiusW(0.3f); + fireSparksRising->doShape(true); + 
fireSparksRising->doRotation(false); + fireSparksRising->shapeType(ST_Sphere); + fireSparksRising->shapeRadius(0.05f); + fireSparksRising->doAcceleration(true); + fireSparksRising->acceleration(0, 1.5f, 0); + fireSparksRising->doColor(true); + fireSparksRising->doColorOverLT(true); + fireSparksRising->doBlendBrightness(true); + fireSparksRising->doSizeOverLT(false); + fireSparksRising->doAlphaOverLT(false); + fireSparksRising->doGravity(false); + fireComplex->addChild(new SLNode(fireSparksRising, + "Fire sparks rising node")); + } + + return fireComplex; +} +//----------------------------------------------------------------------------- +SLNode* AppDemoSceneParticleComplexFire::createTorchFire(SLAssetManager* am, + SLScene* s, + SLbool withLight, + SLGLTexture* texFireCld, + SLGLTexture* texFireFlm, + SLint flipbookCols, + SLint flipbookRows) +{ + + SLNode* torchFire = new SLNode("Fire torch node"); + + // Fire light node + if (withLight) + { + SLLightSpot* light1 = new SLLightSpot(am, + s, + 0.1f, + 180.0f, + false); + light1->name("Fire light node"); + light1->translate(0, 0, 0); + light1->diffuseColor(SLCol4f(1, 0.4f, 0.0f)); + light1->diffusePower(2); + light1->attenuation(0, 0, 1); + torchFire->addChild(light1); + } + + // Fire glow mesh + { + SLParticleSystem* glow = new SLParticleSystem(am, + 40, + SLVec3f(-0.1f, 0.0f, -0.1f), + SLVec3f(0.1f, 0.0f, 0.1f), + 1.5f, + texFireCld, + "Torch Glow PS"); + glow->color(SLCol4f(0.9f, 0.5f, 0, 0.63f)); + glow->doBlendBrightness(true); + glow->radiusW(0.15f); + glow->radiusH(0.15f); + glow->doSizeOverLT(false); + SLNode* fireGlowNode = new SLNode(glow, "Torch Glow Node"); + fireGlowNode->translate(0, -0.4f, 0); + torchFire->addChild(fireGlowNode); + } + + // Fire torches + { + SLParticleSystem* flame = new SLParticleSystem(am, + 1, + SLVec3f(0.0f, 0.0f, 0.0f), + SLVec3f(0.0f, 0.0f, 0.0f), + 4.0f, + texFireCld, + "Torch Flame PS", + texFireFlm); + flame->flipbookColumns(flipbookCols); + flame->flipbookRows(flipbookRows); 
+ flame->doFlipBookTexture(true); + flame->doCounterGap(false); // We don't want to have flickering + flame->doAlphaOverLT(false); + flame->doSizeOverLT(false); + flame->doRotation(false); + flame->doColor(false); + flame->frameRateFB(64); + flame->radiusW(0.3f); + flame->radiusH(0.8f); + flame->billboardType(BT_Vertical); + SLNode* torchFlameNode = new SLNode(flame, "Torch Flame Node"); + torchFlameNode->translate(0, 0.3f, 0); + torchFire->addChild(torchFlameNode); + } + + return torchFire; +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleComplexFire.h b/apps/app_demo/source/scenes/AppDemoSceneParticleComplexFire.h new file mode 100644 index 00000000..babf61ae --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleComplexFire.h @@ -0,0 +1,78 @@ +/** + * \file AppDemoSceneParticleComplexFire.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLEFIRE_H +#define APPDEMOSCENEPARTICLEFIRE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for fire ring particle system scene +class AppDemoSceneParticleComplexFire : public SLScene +{ +public: + AppDemoSceneParticleComplexFire(SLSceneID sceneID); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. 
Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + + SLNode* createComplexFire(SLAssetManager* am, + SLScene* s, + SLbool withLight, + SLGLTexture* texFireCld, + SLGLTexture* texFireFlm, + SLint flipbookCols, + SLint flipbookRows, + SLGLTexture* texCircle, + SLGLTexture* texSmokeB, + SLGLTexture* texSmokeW); + SLNode* createTorchFire(SLAssetManager* am, + SLScene* s, + SLbool withLight, + SLGLTexture* texFireCld, + SLGLTexture* texFireFlm, + SLint flipbookCols, + SLint flipbookRows); + +private: + SLSceneID _sceneID; + SLGLTexture* _texFireCld; + SLGLTexture* _texFireFlm; + SLGLTexture* _texCircle; + SLGLTexture* _texSmokeB; + SLGLTexture* _texSmokeW; + SLGLTexture* _texTorchFlm; + SLGLTexture* _texTorchSmk; + SLGLTexture* _texWallDIF; + SLGLTexture* _texWallNRM; + SLNode* _firewood; + SLNode* _torchL; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleDustStorm.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleDustStorm.cpp new file mode 100644 
index 00000000..7465659e --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleDustStorm.cpp @@ -0,0 +1,90 @@ +/** + * \file AppDemoSceneParticleDustStorm.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleDustStorm::AppDemoSceneParticleDustStorm() + : SLScene("Dust Storm Particle System") +{ + info("This dust storm particle system uses the box shape type for distribution.\n" + "See the properties window for the detailed settings of the particles system"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneParticleDustStorm::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "ParticleSmoke_08_C.png"); + al.addTextureToLoad(_texFlip, + AppCommon::texturePath + + "ParticleSmoke_03_8x8_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneParticleDustStorm::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create and add camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 55); + cam1->lookAt(0, 0, 0); + cam1->focalDist(55); + scene->addChild(cam1); + sv->camera(cam1); + + // Create meshes and nodes + // Dust storm + SLParticleSystem* ps = new SLParticleSystem(am, + 500, + SLVec3f(-0.1f, -0.5f, -5.0f), + SLVec3f(0.1f, 0.5f, -2.5f), + 3.5f, + _texC, + "DustStorm", + _texFlip); + ps->doShape(true); + ps->shapeType(ST_Box); + ps->shapeScale(50.0f, 1.0f, 50.0f); + ps->scale(15.0f); + ps->doSizeOverLT(false); + ps->doAlphaOverLT(true); + ps->doAlphaOverLTCurve(true); + ps->bezierStartEndPointAlpha()[1] = 0.0f; + ps->bezierControlPointAlpha()[1] = 0.5f; + ps->bezierControlPointAlpha()[2] = 0.5f; + ps->generateBernsteinPAlpha(); + ps->doRotRange(true); + ps->color(SLCol4f(1.0f, 1.0f, 1.0f, 1.0f)); + ps->doBlendBrightness(false); + ps->frameRateFB(16); + + SLMesh* pSMesh = ps; + SLNode* pSNode = new SLNode(pSMesh, "Particle system node fire2"); + pSNode->translate(3.0f, -0.8f, 0.0f, TS_object); + + scene->addChild(pSNode); + + // Set background color and the root scene node + sv->sceneViewCamera()->background().colors(SLCol4f(0.8f, 0.8f, 0.8f), + SLCol4f(0.2f, 0.2f, 0.2f)); + // Save energy + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleDustStorm.h b/apps/app_demo/source/scenes/AppDemoSceneParticleDustStorm.h new file mode 100644 index 00000000..c09e974b --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleDustStorm.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneParticleDustStorm.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * 
https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLEDUST_H +#define APPDEMOSCENEPARTICLEDUST_H + +#include + +//----------------------------------------------------------------------------- +//! Class for simple particle system scene +class AppDemoSceneParticleDustStorm : public SLScene +{ +public: + AppDemoSceneParticleDustStorm(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texFlip; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleFountain.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleFountain.cpp new file mode 100644 index 00000000..d4a61e1d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleFountain.cpp @@ -0,0 +1,82 @@ +/** + * \file AppDemoSceneParticleFountain.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleFountain::AppDemoSceneParticleFountain() + : SLScene("Fountain Particle System") +{ + info("This fountain particle system uses acceleration and gravity.\n" + "See the properties window for the detailed settings of the particles system"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneParticleFountain::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "ParticleCircle_05_C.png"); + al.addTextureToLoad(_texFlip, + AppCommon::texturePath + + "ParticleSmoke_03_8x8_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneParticleFountain::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create and add camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, -1, 55); + cam1->lookAt(0, -1, 0); + cam1->focalDist(55); + scene->addChild(cam1); + sv->camera(cam1); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->translation(0, -1, 2); + light1->name("light node"); + scene->addChild(light1); + + // Create meshes and nodes + SLParticleSystem* ps = new SLParticleSystem(am, + 5000, + SLVec3f(5.0f, 15.0f, 5.0f), + SLVec3f(-5.0f, 17.0f, -5.0f), + 5.0f, + _texC, + "Fountain", + _texFlip); + SLMesh* pSMesh = ps; + ps->doGravity(true); + ps->color(SLCol4f(0.0039f, 0.14f, 0.86f, 0.33f)); + ps->doSizeOverLT(false); + ps->doAlphaOverLT(false); + SLNode* pSNode = new SLNode(pSMesh, "Particle system node"); + scene->addChild(pSNode); + + // Set background color and the root scene node + sv->sceneViewCamera()->background().colors(SLCol4f(0.8f, 0.8f, 0.8f), + SLCol4f(0.2f, 0.2f, 0.2f)); + // Save energy + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleFountain.h b/apps/app_demo/source/scenes/AppDemoSceneParticleFountain.h new file mode 100644 index 00000000..380c0b46 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleFountain.h @@ -0,0 +1,50 @@ +/** + * \file 
AppDemoSceneParticleFountain.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLEFOUNTAIN_H +#define APPDEMOSCENEPARTICLEFOUNTAIN_H + +#include + +//----------------------------------------------------------------------------- +//! Class for the fountain particle system scene +class AppDemoSceneParticleFountain : public SLScene +{ +public: + AppDemoSceneParticleFountain(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texFlip; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleMany.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleMany.cpp new file mode 100644 index 00000000..ac710f9e --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleMany.cpp @@ -0,0 +1,83 @@ +/** + * \file AppDemoSceneParticleMany.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleMany::AppDemoSceneParticleMany() + : SLScene("Many Particle System") +{ + info("This particle system with 1 mio. particles uses the box shape " + "type for distribution. See the properties window for the detailed " + "settings of the particles system"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneParticleMany::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "ParticleSmoke_08_C.png"); + al.addTextureToLoad(_texFlip, + AppCommon::texturePath + + "ParticleSmoke_03_8x8_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneParticleMany::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.1f); + cam1->clipFar(1000); + cam1->translation(0, 0, 400); + cam1->focalDist(400); + cam1->lookAt(0, 0, 0); + cam1->background().colors(SLCol4f(0.3f, 0.3f, 0.3f)); + cam1->setInitialState(); + + // Root scene node that holds the camera and the particle system node + SLNode* root = new SLNode("Root scene node"); + root3D(root); + root->addChild(cam1); + + // Create the particle system mesh (1 mio. particles, box shape) and its node + SLParticleSystem* ps = new SLParticleSystem(am, + 1000000, + SLVec3f(-10.0f, -10.0f, -10.0f), + SLVec3f(10.0f, 10.0f, 10.0f), + 4.0f, + _texC, + "Particle System", + _texFlip); + ps->doAlphaOverLT(false); + ps->doSizeOverLT(false); + ps->doRotation(false); + ps->doShape(true); + ps->shapeType(ST_Box); + ps->shapeScale(100.0f, 100.0f, 100.0f); + ps->doDirectionSpeed(true); + ps->doBlendBrightness(true); + ps->doColor(true); + ps->color(SLCol4f(0.875f, 0.156f, 0.875f, 1.0f)); + ps->speed(0.0f); + SLMesh* pSMesh = ps; + SLNode* pSNode = new SLNode(pSMesh, "Particle system node"); + root->addChild(pSNode); + + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleMany.h b/apps/app_demo/source/scenes/AppDemoSceneParticleMany.h new file mode 100644 index 00000000..a1b5890d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleMany.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneParticleMany.h + * \brief Class declaration for an SLScene inherited
class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLEMANY_H +#define APPDEMOSCENEPARTICLEMANY_H + +#include + +//----------------------------------------------------------------------------- +//! Class for benchmark scene with a particle system with many particles +class AppDemoSceneParticleMany : public SLScene +{ +public: + AppDemoSceneParticleMany(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texFlip; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleRingOfFire.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleRingOfFire.cpp new file mode 100644 index 00000000..070ca8c2 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleRingOfFire.cpp @@ -0,0 +1,74 @@ +/** + * \file AppDemoSceneParticleRingOfFire.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleRingOfFire::AppDemoSceneParticleRingOfFire() + : SLScene("Ring of Fire Particle System") +{ + info("This ring particle system uses the cone shape type for distribution.\n" + "See the properties window for the settings of the particles system"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneParticleRingOfFire::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "ParticleSmoke_08_C.png"); + al.addTextureToLoad(_texFlip, + AppCommon::texturePath + + "ParticleSmoke_03_8x8_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneParticleRingOfFire::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create a scene group node and make it the 3D root of this scene + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create the particle system mesh (1000 particles, cone shape) + SLParticleSystem* ps = new SLParticleSystem(am, + 1000, + SLVec3f(0.0f, 0.0f, 0.0f), + SLVec3f(0.0f, 0.0f, 0.0f), + 4.0f, + _texC, + "Ring of fire Particle System", + _texFlip); + + ps->doShape(true); + ps->shapeType(ST_Cone); + ps->doShapeSpawnBase(true); + ps->doShapeSurface(true); + ps->shapeRadius(1.0f); + ps->doBlendBrightness(true); + ps->color(SLCol4f(0.925f, 0.238f, 0.097f, 0.503f)); + + SLMesh* pSMesh = ps; + SLNode* pSNode = new SLNode(pSMesh, "Particle Ring Fire node"); + pSNode->rotate(90, 1, 0, 0); + scene->addChild(pSNode); + + // Set the background colors of the scene view camera + sv->sceneViewCamera()->background().colors(SLCol4f(0.8f, 0.8f, 0.8f), + SLCol4f(0.2f, 0.2f, 0.2f)); + // Switch off wait-on-idle; NOTE(review): former remark "Save energy" does not seem to match the false flag - confirm + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleRingOfFire.h b/apps/app_demo/source/scenes/AppDemoSceneParticleRingOfFire.h new file mode 100644 index 00000000..0fcf91ef --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleRingOfFire.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneParticleRingOfFire.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May
2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLERINGOFFIRE_H +#define APPDEMOSCENEPARTICLERINGOFFIRE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for the ring of fire particle system scene +class AppDemoSceneParticleRingOfFire : public SLScene +{ +public: + AppDemoSceneParticleRingOfFire(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texFlip; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleSimple.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleSimple.cpp new file mode 100644 index 00000000..3fa9cbed --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleSimple.cpp @@ -0,0 +1,75 @@ +/** + * \file AppDemoSceneParticleSimple.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleSimple::AppDemoSceneParticleSimple() + : SLScene("Simple Particle System") +{ + info("Simple Particle System with a flip book smoke particle."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneParticleSimple::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "ParticleSmoke_08_C.png"); + al.addTextureToLoad(_texFlip, + AppCommon::texturePath + + "ParticleSmoke_03_8x8_C.png"); +} +//----------------------------------------------------------------------------- +//!
After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneParticleSimple::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create a scene group node and make it the 3D root of this scene + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create and add camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 1.5f, 4); + cam1->lookAt(0, 1.5f, 0); + scene->addChild(cam1); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->translation(5, 5, 5); + light1->name("light node"); + scene->addChild(light1); + + // Create the particle system mesh (50 particles) and its node + SLParticleSystem* ps = new SLParticleSystem(am, + 50, + SLVec3f(0.04f, 0.4f, 0.1f), + SLVec3f(-0.11f, 0.7f, -0.1f), + 4.0f, + _texC, + "Particle System", + _texFlip); + + SLNode* pSNode = new SLNode(ps, "Particle system node"); + scene->addChild(pSNode); + + // Set the background colors of the scene view camera and activate cam1 + sv->sceneViewCamera()->background().colors(SLCol4f(0.8f, 0.8f, 0.8f), + SLCol4f(0.2f, 0.2f, 0.2f)); + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleSimple.h b/apps/app_demo/source/scenes/AppDemoSceneParticleSimple.h new file mode 100644 index 00000000..219b8f17 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleSimple.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneParticleSimple.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code.
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLESIMPLE_H +#define APPDEMOSCENEPARTICLESIMPLE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for simple particle system scene +class AppDemoSceneParticleSimple : public SLScene +{ +public: + AppDemoSceneParticleSimple(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texFlip; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleSun.cpp b/apps/app_demo/source/scenes/AppDemoSceneParticleSun.cpp new file mode 100644 index 00000000..59bb3739 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleSun.cpp @@ -0,0 +1,71 @@ +/** + * \file AppDemoSceneParticleSun.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneParticleSun::AppDemoSceneParticleSun() + : SLScene("Sun Particle System") +{ + info("This sun particle system uses the sphere shape type for distribution.\n" + "See the properties window for the detailed settings of the particles system"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneParticleSun::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "ParticleSmoke_08_C.png"); + al.addTextureToLoad(_texFlip, + AppCommon::texturePath + + "ParticleSmoke_03_8x8_C.png"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneParticleSun::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create a scene group node and make it the 3D root of this scene + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create the particle system mesh (10000 particles, sphere shape) + SLParticleSystem* ps = new SLParticleSystem(am, + 10000, + SLVec3f(0.0f, 0.0f, 0.0f), + SLVec3f(0.0f, 0.0f, 0.0f), + 4.0f, + _texC, + "Sun Particle System", + _texFlip); + + ps->doShape(true); + ps->shapeType(ST_Sphere); + ps->shapeRadius(3.0f); + ps->doBlendBrightness(true); + ps->color(SLCol4f(0.925f, 0.238f, 0.097f, 0.199f)); + + SLMesh* pSMesh = ps; + SLNode* pSNode = new SLNode(pSMesh, "Particle Sun node"); + scene->addChild(pSNode); + + // Set the background colors of the scene view camera + sv->sceneViewCamera()->background().colors(SLCol4f(0.8f, 0.8f, 0.8f), + SLCol4f(0.2f, 0.2f, 0.2f)); + // Switch off wait-on-idle; NOTE(review): former remark "Save energy" does not seem to match the false flag - confirm + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneParticleSun.h b/apps/app_demo/source/scenes/AppDemoSceneParticleSun.h new file mode 100644 index 00000000..003d3ffb --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneParticleSun.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneParticleSun.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Particle System from Marc Affolter + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please
use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPARTICLESUN_H +#define APPDEMOSCENEPARTICLESUN_H + +#include + +//----------------------------------------------------------------------------- +//! Class for sun particle system scene +class AppDemoSceneParticleSun : public SLScene +{ +public: + AppDemoSceneParticleSun(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texFlip; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoScenePointClouds.cpp b/apps/app_demo/source/scenes/AppDemoScenePointClouds.cpp new file mode 100644 index 00000000..8800e48f --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoScenePointClouds.cpp @@ -0,0 +1,79 @@ +/** + * \file AppDemoScenePointClouds.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoScenePointClouds::AppDemoScenePointClouds() : SLScene("Point Cloud Test Scene") +{ + info("Point Clouds with normal and uniform distribution. " + "You can select vertices with rectangle select (CTRL-LMB) " + "and deselect selected with ALT-LMB."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoScenePointClouds::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addProgramToLoad(_sp1, + AppCommon::shaderPath + "ColorUniformPoint.vert", + AppCommon::shaderPath + "Color.frag"); + al.addProgramToLoad(_sp2, + AppCommon::shaderPath + "ColorUniformPoint.vert", + AppCommon::shaderPath + "Color.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here: a camera, a spot light and two point clouds of 1000 points each (normal and uniform distribution). +void AppDemoScenePointClouds::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.1f); + cam1->clipFar(100); + cam1->translation(0, 0, 15); + cam1->lookAt(0, 0, 0); + cam1->focalDist(15); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLLightSpot* light1 = new SLLightSpot(am, this, 10, 10, 10, 0.3f); + light1->powers(0.2f, 0.8f, 1.0f); + light1->attenuation(1, 0, 0); + + SLMaterial* pcMat1 = new SLMaterial(am, "Red", SLCol4f::RED); + pcMat1->program(_sp1); + pcMat1->program()->addUniform1f(new SLGLUniform1f(UT_const, "u_pointSize", 4.0f)); + SLRnd3fNormal rndN(SLVec3f(0, 0, 0), SLVec3f(5, 2, 1)); + SLNode* pc1 = new SLNode(new SLPoints(am, 1000, rndN, "PC1", pcMat1)); + pc1->translate(-5, 0, 0); + + SLMaterial* pcMat2 = new SLMaterial(am, "Green", SLCol4f::GREEN); + pcMat2->program(_sp2); + pcMat2->program()->addUniform1f(new SLGLUniform1f(UT_const, "u_pointSize", 1.0f)); + SLRnd3fUniform rndU(SLVec3f(0, 0, 0), SLVec3f(2, 3, 5)); + SLNode* pc2 = new SLNode(new SLPoints(am, 1000, rndU, "PC2", pcMat2)); + pc2->translate(5, 0, 0); + + SLNode* scene = new SLNode("scene"); + this->root3D(scene); + scene->addChild(cam1); + scene->addChild(light1); + scene->addChild(pc1); + scene->addChild(pc2); + + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git
a/apps/app_demo/source/scenes/AppDemoScenePointClouds.h b/apps/app_demo/source/scenes/AppDemoScenePointClouds.h new file mode 100644 index 00000000..34a43221 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoScenePointClouds.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoScenePointClouds.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEPOINTCLOUDS_H +#define APPDEMOSCENEPOINTCLOUDS_H + +#include + +//----------------------------------------------------------------------------- +//! Class for point cloud test scene +class AppDemoScenePointClouds : public SLScene +{ +public: + AppDemoScenePointClouds(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread.
It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLProgram* _sp1; + SLGLProgram* _sp2; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTDoF.cpp b/apps/app_demo/source/scenes/AppDemoSceneRTDoF.cpp new file mode 100644 index 00000000..1d6868b2 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRTDoF.cpp @@ -0,0 +1,130 @@ +/** + * \file AppDemoSceneRTDoF.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneRTDoF::AppDemoSceneRTDoF() + : SLScene("Depth of Field Ray Tracing") +{ + info("Muttenzer Box with environment mapped reflective sphere and " + "transparent refractive glass sphere. Try ray tracing for real " + "reflections and soft shadows."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneRTDoF::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_tex1, + AppCommon::texturePath + + "Checkerboard0512_C.png", + SL_ANISOTROPY_MAX, + GL_LINEAR); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here: a textured rectangle, a spot light, seven spheres and a camera with lens sampling. +void AppDemoSceneRTDoF::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create root node + SLNode* scene = new SLNode; + root3D(scene); + + // Create materials (the texture _tex1 was registered in registerAssetsToLoad) + SLMaterial* mT = new SLMaterial(am, "mT", _tex1); + mT->kr(0.5f); + SLMaterial* mW = new SLMaterial(am, "mW", SLCol4f::WHITE); + SLMaterial* mB = new SLMaterial(am, "mB", SLCol4f::GRAY); + SLMaterial* mY = new SLMaterial(am, "mY", SLCol4f::YELLOW); + SLMaterial* mR = new SLMaterial(am, "mR", SLCol4f::RED); + SLMaterial* mG = new SLMaterial(am, "mG", SLCol4f::GREEN); + SLMaterial* mM = new SLMaterial(am, "mM", SLCol4f::MAGENTA); + +#ifndef SL_GLES + SLuint numSamples = 10; +#else + SLuint numSamples = 4; +#endif + + stringstream ss; + ss << "Ray tracing with depth of field blur. Each pixel is sampled " + << numSamples * numSamples + << "x from a lens.
Be patient on mobile devices."; + + info(ss.str()); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 2, 7); + cam1->lookAt(0, 0, 0); + cam1->focalDist(cam1->translationOS().length()); + cam1->clipFar(80); + cam1->lensDiameter(0.4f); + cam1->lensSamples()->samples(numSamples, numSamples); + cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f)); + cam1->setInitialState(); + cam1->fogIsOn(true); + cam1->fogMode(FM_exp); + cam1->fogDensity(0.04f); + scene->addChild(cam1); + + SLuint res = 36; + SLNode* rect = new SLNode(new SLRectangle(am, + SLVec2f(-40, -10), + SLVec2f(40, 70), + SLVec2f(0, 0), + SLVec2f(4, 4), + 2, + 2, + "Rect", + mT)); + rect->rotate(90, -1, 0, 0); + rect->translate(0, 0, -0.5f, TS_object); + scene->addChild(rect); + + SLLightSpot* light1 = new SLLightSpot(am, this, 2, 2, 0, 0.1f); + light1->ambiDiffPowers(0.1f, 1); + light1->attenuation(1, 0, 0); + scene->addChild(light1); + + SLNode* balls = new SLNode; + SLNode* sp; + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S1", mW)); + sp->translate(2.0, 0, -4, TS_object); + balls->addChild(sp); + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S2", mB)); + sp->translate(1.5, 0, -3, TS_object); + balls->addChild(sp); + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S3", mY)); + sp->translate(1.0, 0, -2, TS_object); + balls->addChild(sp); + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S4", mR)); + sp->translate(0.5, 0, -1, TS_object); + balls->addChild(sp); + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S5", mG)); + sp->translate(0.0, 0, 0, TS_object); + balls->addChild(sp); + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S6", mM)); + sp->translate(-0.5, 0, 1, TS_object); + balls->addChild(sp); + sp = new SLNode(new SLSphere(am, 0.5f, res, res, "S7", mW)); + sp->translate(-1.0, 0, 2, TS_object); + balls->addChild(sp); + scene->addChild(balls); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- \ No newline at
end of file diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTDoF.h b/apps/app_demo/source/scenes/AppDemoSceneRTDoF.h new file mode 100644 index 00000000..b5f2f2f3 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRTDoF.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneRTDoF.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENERTDOF_H +#define APPDEMOSCENERTDOF_H + +#include + +//----------------------------------------------------------------------------- +//! Class for the depth of field ray tracing scene +class AppDemoSceneRTDoF : public SLScene +{ +public: + AppDemoSceneRTDoF(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. 
It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _tex1; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTLens.cpp b/apps/app_demo/source/scenes/AppDemoSceneRTLens.cpp new file mode 100644 index 00000000..23eccf29 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRTLens.cpp @@ -0,0 +1,128 @@ +/** + * \file AppDemoSceneRTLens.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include "SLLightDirect.h" +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneRTLens::AppDemoSceneRTLens() + : SLScene("Ray tracing through a lens") +{ + info("Ray tracing through a lens"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneRTLens::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_tex1, + AppCommon::texturePath + + "VisionExample.jpg"); +} +//----------------------------------------------------------------------------- +//! 
After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneRTLens::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create root node + SLNode* scene = new SLNode; + root3D(scene); + + // Create textures and materials + SLMaterial* mT = new SLMaterial(am, + "mT", + _tex1, + nullptr, + nullptr, + nullptr); + mT->kr(0.5f); + + // Glass material + SLMaterial* matLens = new SLMaterial(am, + "lens", + SLCol4f(0.0f, 0.0f, 0.0f), + SLCol4f(0.5f, 0.5f, 0.5f), + 100, + 0.5f, + 0.5f, + 1.5f); + // SLGLShaderProg* sp1 = new SLGLShaderProgGeneric("RefractReflect.vert", "RefractReflect.frag"); + // matLens->shaderProg(sp1); + +#ifndef APP_USES_GLES + SLuint numSamples = 10; +#else + SLuint numSamples = 6; +#endif + + // Scene + SLCamera* cam1 = new SLCamera; + cam1->translation(0, 8, 0); + cam1->lookAt(0, 0, 0); + cam1->focalDist(6); + cam1->lensDiameter(0.4f); + cam1->lensSamples()->samples(numSamples, numSamples); + cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + scene->addChild(cam1); + + SLLightDirect* light1 = new SLLightDirect(am, this); + light1->translation(1, 1,1); + light1->lookAt(0, 0, 0); + scene->addChild(light1); + + SLuint res = 20; + SLNode* rect = new SLNode(new SLRectangle(am, + SLVec2f(-5, -5), + SLVec2f(5, 5), + res, + res, + "Rect", + mT)); + rect->rotate(90, -1, 0, 0); + rect->translate(0, 0, -0.0f, TS_object); + scene->addChild(rect); + + // Lens from eye prescription card + // SLNode* lensA = new SLNode(new SLLens(s, 0.50f, -0.50f, 4.0f, 0.0f, 32, 32, "presbyopic", matLens)); // Weitsichtig + // lensA->translate(-2, 1, -2); + // scene->addChild(lensA); + + // SLNode* lensB = new SLNode(new SLLens(s, -0.65f, -0.10f, 4.0f, 0.0f, 32, 32, "myopic", matLens)); // Kurzsichtig + // lensB->translate(2, 1, -2); + // scene->addChild(lensB); + + // Lens with radius + // SLNode* lensC = new SLNode(new SLLens(s, 5.0, 4.0, 4.0f, 0.0f, 32, 
32, "presbyopic", matLens)); // Weitsichtig + // lensC->translate(-2, 1, 2); + // scene->addChild(lensC); + + SLNode* lensD = new SLNode(new SLLens(am, + -15.0f, + -15.0f, + 1.0f, + 0.1f, + res, + res, + "myopic", + matLens)); // Kurzsichtig + lensD->translate(0, 6, 0); + scene->addChild(lensD); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- \ No newline at end of file diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTLens.h b/apps/app_demo/source/scenes/AppDemoSceneRTLens.h new file mode 100644 index 00000000..20ea7475 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRTLens.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneRTLens.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENERTLENS_H +#define APPDEMOSCENERTLENS_H + +#include + +//----------------------------------------------------------------------------- +//! Class for the lens ray tracing scene +class AppDemoSceneRTLens : public SLScene +{ +public: + AppDemoSceneRTLens(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! 
@remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _tex1; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTMuttenzerBox.cpp b/apps/app_demo/source/scenes/AppDemoSceneRTMuttenzerBox.cpp new file mode 100644 index 00000000..348896eb --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRTMuttenzerBox.cpp @@ -0,0 +1,222 @@ +/** + * \file AppDemoSceneRTMuttenzerBox.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneRTMuttenzerBox::AppDemoSceneRTMuttenzerBox() + : SLScene("Muttenzer Box Ray Tracing") +{ + info("Muttenzer Box with environment mapped reflective sphere and " + "transparent refractive glass sphere. 
Try ray tracing for real " + "reflections and soft shadows."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneRTMuttenzerBox::registerAssetsToLoad(SLAssetLoader& al) +{ + + al.addTextureToLoad(_tex1, + AppCommon::texturePath + "MuttenzerBox+X0512_C.png", + AppCommon::texturePath + "MuttenzerBox-X0512_C.png", + AppCommon::texturePath + "MuttenzerBox+Y0512_C.png", + AppCommon::texturePath + "MuttenzerBox-Y0512_C.png", + AppCommon::texturePath + "MuttenzerBox+Z0512_C.png", + AppCommon::texturePath + "MuttenzerBox-Z0512_C.png"); + + al.addProgramToLoad(_spRefl, + AppCommon::shaderPath + "Reflect.vert", + AppCommon::shaderPath + "Reflect.frag"); + al.addProgramToLoad(_spRefr, + AppCommon::shaderPath + "RefractReflect.vert", + AppCommon::shaderPath + "RefractReflect.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneRTMuttenzerBox::assemble(SLAssetManager* am,
+                                          SLSceneView*    sv)
+{
+    // Colors shared by the materials and the light below
+    SLCol4f lightEmisRGB(7.0f, 7.0f, 7.0f);
+    SLCol4f grayRGB(0.75f, 0.75f, 0.75f);
+    SLCol4f redRGB(0.75f, 0.25f, 0.25f);
+    SLCol4f blueRGB(0.25f, 0.25f, 0.75f);
+    SLCol4f blackRGB(0.00f, 0.00f, 0.00f);
+
+    // create materials
+    SLMaterial* cream = new SLMaterial(am,
+                                       "cream",
+                                       grayRGB,
+                                       SLCol4f::BLACK,
+                                       0);
+    SLMaterial* red = new SLMaterial(am,
+                                     "red",
+                                     redRGB,
+                                     SLCol4f::BLACK,
+                                     0);
+    SLMaterial* blue = new SLMaterial(am,
+                                      "blue",
+                                      blueRGB,
+                                      SLCol4f::BLACK,
+                                      0);
+
+    // Material for mirror sphere
+    SLMaterial* refl = new SLMaterial(am,
+                                      "refl",
+                                      blackRGB,
+                                      SLCol4f::WHITE,
+                                      1000,
+                                      1.0f);
+    refl->addTexture(_tex1);
+    refl->program(_spRefl);
+
+    // Material for glass sphere
+    SLMaterial* refr = new SLMaterial(am,
+                                      "refr",
+                                      blackRGB,
+                                      blackRGB,
+                                      100,
+                                      0.05f,
+                                      0.95f,
+                                      1.5f);
+    refr->translucency(1000);
+    refr->transmissive(SLCol4f::WHITE);
+    refr->addTexture(_tex1);
+    refr->program(_spRefr);
+
+    // Mirror sphere
+    SLNode* sphere1 = new SLNode(new SLSphere(am,
+                                              0.5f,
+                                              32,
+                                              32,
+                                              "Sphere1",
+                                              refl));
+    sphere1->translate(-0.65f, -0.75f, -0.55f, TS_object);
+
+    // Glass sphere
+    SLNode* sphere2 = new SLNode(new SLSphere(am,
+                                              0.45f,
+                                              32,
+                                              32,
+                                              "Sphere2",
+                                              refr));
+    sphere2->translate(0.73f, -0.8f, 0.10f, TS_object);
+
+    // Both spheres hang under one group node that is added to the scene last
+    SLNode* balls = new SLNode;
+    balls->addChild(sphere1);
+    balls->addChild(sphere2);
+
+    // Rectangular light
+    SLLightRect* lightRect = new SLLightRect(am,
+                                             this,
+                                             1,
+                                             0.65f);
+    lightRect->rotate(90, -1.0f, 0.0f, 0.0f);
+    lightRect->translate(0.0f, -0.25f, 1.18f, TS_object);
+    lightRect->spotCutOffDEG(90);
+    lightRect->spotExponent(1.0);
+    lightRect->ambientColor(SLCol4f::WHITE);
+    lightRect->ambientPower(0.25f);
+    lightRect->diffuseColor(lightEmisRGB);
+    lightRect->attenuation(0, 0, 1);
+    lightRect->samplesXY(11, 7);
+    lightRect->createsShadows(true);
+    lightRect->createShadowMap();
+
+    // Global ambient light is 1% of the light's emission color
+    SLLight::globalAmbient.set(lightEmisRGB * 0.01f);
+
+    // create camera
+    SLCamera* cam1 = new SLCamera();
+    cam1->translation(0, 0, 7.2f);
+    cam1->fov(27);
+    cam1->focalDist(cam1->translationOS().length());
+    cam1->background().colors(SLCol4f(0.0f, 0.0f, 0.0f));
+    cam1->setInitialState();
+    cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+
+    // assemble scene
+    SLNode* scene = new SLNode;
+    root3D(scene);
+    scene->addChild(cam1);
+    scene->addChild(lightRect);
+
+    // create wall polygons
+    SLfloat pL = -1.48f, pR = 1.48f; // left/right
+    SLfloat pB = -1.25f, pT = 1.19f; // bottom/top
+    SLfloat pN = 1.79f, pF = -1.55f; // near/far
+
+    // bottom plane
+    SLNode* b = new SLNode(new SLRectangle(am,
+                                           SLVec2f(pL, -pN),
+                                           SLVec2f(pR, -pF),
+                                           6,
+                                           6,
+                                           "bottom",
+                                           cream));
+    b->rotate(90, -1, 0, 0);
+    b->translate(0, 0, pB, TS_object);
+    scene->addChild(b);
+
+    // top plane
+    SLNode* t = new SLNode(new SLRectangle(am,
+                                           SLVec2f(pL, pF),
+                                           SLVec2f(pR, pN),
+                                           6,
+                                           6,
+                                           "top",
+                                           cream));
+    t->rotate(90, 1, 0, 0);
+    t->translate(0, 0, -pT, TS_object);
+    scene->addChild(t);
+
+    // far plane
+    SLNode* f = new SLNode(new SLRectangle(am,
+                                           SLVec2f(pL, pB),
+                                           SLVec2f(pR, pT),
+                                           6,
+                                           6,
+                                           "far",
+                                           cream));
+    f->translate(0, 0, pF, TS_object);
+    scene->addChild(f);
+
+    // left plane
+    SLNode* l = new SLNode(new SLRectangle(am,
+                                           SLVec2f(-pN, pB),
+                                           SLVec2f(-pF, pT),
+                                           6,
+                                           6,
+                                           "left",
+                                           red));
+    l->rotate(90, 0, 1, 0);
+    l->translate(0, 0, pL, TS_object);
+    scene->addChild(l);
+
+    // right plane
+    SLNode* r = new SLNode(new SLRectangle(am,
+                                           SLVec2f(pF, pB),
+                                           SLVec2f(pN, pT),
+                                           6,
+                                           6,
+                                           "right",
+                                           blue));
+    r->rotate(90, 0, -1, 0);
+    r->translate(0, 0, -pR, TS_object);
+    scene->addChild(r);
+
+    scene->addChild(balls);
+
+    // Make cam1 the camera of the scene view
+    sv->camera(cam1);
+}
+//-----------------------------------------------------------------------------
\ No newline at end of file
diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTMuttenzerBox.h b/apps/app_demo/source/scenes/AppDemoSceneRTMuttenzerBox.h
new file mode 100644
index 00000000..ece92d3a
--- /dev/null
+++ 
b/apps/app_demo/source/scenes/AppDemoSceneRTMuttenzerBox.h @@ -0,0 +1,51 @@ +/** + * \file AppDemoSceneRTMuttenzerBox.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENERTMUTTENZ_H +#define APPDEMOSCENERTMUTTENZ_H + +#include + +//----------------------------------------------------------------------------- +//! Class for the Muttenzer Box ray tracing scene +class AppDemoSceneRTMuttenzerBox : public SLScene +{ +public: + AppDemoSceneRTMuttenzerBox(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _tex1; + SLGLProgram* _spRefl; + SLGLProgram* _spRefr; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTSpheres.cpp b/apps/app_demo/source/scenes/AppDemoSceneRTSpheres.cpp new file mode 100644 index 00000000..6ce0187a --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRTSpheres.cpp @@ -0,0 +1,260 @@ +/** + * \file AppDemoSceneRTSpheres.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneRTSpheres::AppDemoSceneRTSpheres(SLSceneID sceneID) + : SLScene("Ray Tracing Spheres"), + _sceneID(sceneID) +{ + if (sceneID == SID_RTSpheres) + info("Classic ray tracing scene with transparent and reflective " + "spheres. Be patient on mobile devices."); + else if (sceneID == SID_RTSoftShadows) + { + name("Soft Shadow Ray Tracing"); + info("Ray tracing with soft shadow light sampling. Each light " + "source is sampled 64x per pixel. Be patient on mobile devices."); + } + else + SL_EXIT_MSG("Should not get here!"); +} +//----------------------------------------------------------------------------- +//! 
All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneRTSpheres::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Nothing to register: assemble() below uses no loaded textures or programs
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+/*! Builds one of two scenes depending on the scene ID stored by the
+constructor: the classic sphere group (SID_RTSpheres) or the soft shadow
+variant (SID_RTSoftShadows). */
+void AppDemoSceneRTSpheres::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    if (_sceneID == SID_RTSpheres)
+    {
+        // define materials
+        SLMaterial* matGla = new SLMaterial(am,
+                                            "Glass",
+                                            SLCol4f(0.0f, 0.0f, 0.0f),
+                                            SLCol4f(0.5f, 0.5f, 0.5f),
+                                            100,
+                                            0.4f,
+                                            0.6f,
+                                            1.5f);
+        SLMaterial* matRed = new SLMaterial(am,
+                                            "Red",
+                                            SLCol4f(0.5f, 0.0f, 0.0f),
+                                            SLCol4f(0.5f, 0.5f, 0.5f),
+                                            100,
+                                            0.5f,
+                                            0.0f,
+                                            1.0f);
+        SLMaterial* matYel = new SLMaterial(am,
+                                            "Floor",
+                                            SLCol4f(0.8f, 0.6f, 0.2f),
+                                            SLCol4f(0.8f, 0.8f, 0.8f),
+                                            100,
+                                            0.5f,
+                                            0.0f,
+                                            1.0f);
+
+        SLCamera* cam1 = new SLCamera();
+        cam1->translation(0, 0.1f, 2.5f);
+        cam1->lookAt(0, 0, 0);
+        cam1->focalDist(cam1->translationOS().length());
+        cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f));
+        cam1->setInitialState();
+        cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+
+        SLNode* rect = new SLNode(new SLRectangle(am,
+                                                  SLVec2f(-3, -3),
+                                                  SLVec2f(5, 4),
+                                                  20,
+                                                  20,
+                                                  "Floor",
+                                                  matYel));
+        rect->rotate(90, -1, 0, 0);
+        rect->translate(0, -1, -0.5f, TS_object);
+
+        SLLightSpot* light1 = new SLLightSpot(am, this, 2, 2, 2, 0.1f);
+        light1->powers(1, 7, 7);
+        light1->attenuation(0, 0, 1);
+
+        SLLightSpot* light2 = new SLLightSpot(am, this, 2, 2, -2, 0.1f);
+        light2->powers(1, 7, 7);
+        light2->attenuation(0, 0, 1);
+
+        SLNode* scene = new SLNode;
+        sv->camera(cam1);
+        scene->addChild(light1);
+        scene->addChild(light2);
+        scene->addChild(SphereGroupRT(am, 3, 0, 0, 0, 1, 30, matGla, matRed));
+        scene->addChild(rect);
+        scene->addChild(cam1);
+
+        root3D(scene);
+    }
+    else if (_sceneID == SID_RTSoftShadows)
+    {
+        // Create root node
+        SLNode* scene = new SLNode;
+        root3D(scene);
+
+        // define materials
+        SLMaterial* matBlk = new SLMaterial(am,
+                                            "Glass",
+                                            SLCol4f(0.0f, 0.0f, 0.0f),
+                                            SLCol4f(0.5f, 0.5f, 0.5f),
+                                            100,
+                                            0.5f,
+                                            0.5f,
+                                            1.5f);
+        SLMaterial* matRed = new SLMaterial(am,
+                                            "Red",
+                                            SLCol4f(0.5f, 0.0f, 0.0f),
+                                            SLCol4f(0.5f, 0.5f, 0.5f),
+                                            100,
+                                            0.5f,
+                                            0.0f,
+                                            1.0f);
+        SLMaterial* matYel = new SLMaterial(am,
+                                            "Floor",
+                                            SLCol4f(0.8f, 0.6f, 0.2f),
+                                            SLCol4f(0.8f, 0.8f, 0.8f),
+                                            100,
+                                            0.0f,
+                                            0.0f,
+                                            1.0f);
+
+        SLCamera* cam1 = new SLCamera;
+        cam1->translation(0, 0.1f, 4);
+        cam1->lookAt(0, 0, 0);
+        cam1->focalDist(cam1->translationOS().length());
+        cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f));
+        cam1->setInitialState();
+        cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+        scene->addChild(cam1);
+
+        SLNode* rect = new SLNode(new SLRectangle(am,
+                                                  SLVec2f(-5, -5),
+                                                  SLVec2f(5, 5),
+                                                  1,
+                                                  1,
+                                                  "Rect",
+                                                  matYel));
+        rect->rotate(90, -1, 0, 0);
+        rect->translate(0, 0, -0.5f);
+        rect->castsShadows(false);
+        scene->addChild(rect);
+
+        // Each light is sampled 8 x 8 = 64 times, as the scene info text states
+        SLLightSpot* light1 = new SLLightSpot(am, this, 2, 2, 2, 0.3f);
+        light1->samples(8, 8);
+        light1->attenuation(0, 0, 1);
+        light1->createsShadows(true);
+        light1->createShadowMap();
+        scene->addChild(light1);
+
+        SLLightSpot* light2 = new SLLightSpot(am, this, 2, 2, -2, 0.3f);
+        light2->samples(8, 8);
+        light2->attenuation(0, 0, 1);
+        light2->createsShadows(true);
+        light2->createShadowMap();
+        scene->addChild(light2);
+
+        scene->addChild(SphereGroupRT(am, 1, 0, 0, 0, 1, 32, matBlk, matRed));
+
+        sv->camera(cam1);
+    }
+}
+//-----------------------------------------------------------------------------
+//! 
Creates a recursive sphere group used for the ray tracing scenes
+/*! A sphere of radius 0.5 * scale is placed at (x, y, z). With depth 0 this is
+a single leaf node using matRed. Otherwise the sphere uses matGlass and gets
+nine child groups: six around it at the same height and three above it. Each
+child has a third of the scale, a resolution reduced by 4 (but never below 8),
+depth - 1, and matRed passed for both material parameters. The mesh name is
+"GlassSphere" if matGlass->kt() > 0 and "RedSphere" otherwise. */
+SLNode* AppDemoSceneRTSpheres::SphereGroupRT(SLAssetManager* am,
+                                             SLint           depth,
+                                             SLfloat         x,
+                                             SLfloat         y,
+                                             SLfloat         z,
+                                             SLfloat         scale,
+                                             SLuint          resolution,
+                                             SLMaterial*     matGlass,
+                                             SLMaterial*     matRed)
+{
+    const SLstring meshName = matGlass->kt() > 0 ? "GlassSphere" : "RedSphere";
+
+    // Recursion end: one leaf sphere
+    if (depth == 0)
+    {
+        SLSphere* leafMesh = new SLSphere(am, 0.5f * scale, resolution, resolution, meshName, matRed);
+        SLNode*   leafNode = new SLNode(leafMesh, "Sphere");
+        leafNode->translate(x, y, z, TS_object);
+        return leafNode;
+    }
+
+    // Center sphere of this recursion level
+    SLNode* group = new SLNode(new SLSphere(am, 0.5f * scale, resolution, resolution, meshName, matGlass),
+                               "SphereGroupRT");
+    group->translate(x, y, z, TS_object);
+
+    const SLint   childDepth = depth - 1;
+    const SLfloat childScale = scale / 3;
+    const SLuint  childRes   = (SLuint)std::max((SLint)resolution - 4, 8);
+
+    // (x, z) offset factors of the six children at the height of the center
+    static const SLfloat ringXZ[6][2] = {{0.643951f, 0.172546f},
+                                         {0.172546f, 0.643951f},
+                                         {-0.471405f, 0.471405f},
+                                         {-0.643951f, -0.172546f},
+                                         {-0.172546f, -0.643951f},
+                                         {0.471405f, -0.471405f}};
+
+    // (x, z) offset factors of the three children above the center
+    static const SLfloat topXZ[3][2] = {{0.272166f, 0.272166f},
+                                        {-0.371785f, 0.099619f},
+                                        {0.099619f, -0.371785f}};
+
+    for (const auto& xz : ringXZ)
+        group->addChild(SphereGroupRT(am, childDepth, xz[0] * scale, 0, xz[1] * scale, childScale, childRes, matRed, matRed));
+
+    for (const auto& xz : topXZ)
+        group->addChild(SphereGroupRT(am, childDepth, xz[0] * scale, 0.544331f * scale, xz[1] * scale, childScale, childRes, matRed, matRed));
+
+    return group;
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneRTSpheres.h b/apps/app_demo/source/scenes/AppDemoSceneRTSpheres.h
new file mode 100644
index 00000000..873eb88d
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneRTSpheres.h
@@ -0,0 +1,40 @@
+/**
+ * \file AppDemoSceneRTSpheres.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#ifndef APPDEMOSCENERTSPHERES_H
+#define APPDEMOSCENERTSPHERES_H
+
+#include
+
+//-----------------------------------------------------------------------------
+//! Class for the ray tracing scene with sphere group
+/*! The same class serves two scenes; the scene ID passed to the constructor
+(SID_RTSpheres or SID_RTSoftShadows) selects which one assemble() builds. */
+class AppDemoSceneRTSpheres : public SLScene
+{
+public:
+    //! Stores sceneID and sets the matching name and info text
+    AppDemoSceneRTSpheres(SLSceneID sceneID);
+    //! Registers the assets to load in parallel (none for this scene)
+    void registerAssetsToLoad(SLAssetLoader& al) override;
+    //! Builds the scene graph for the scene selected by the stored scene ID
+    void assemble(SLAssetManager* am, SLSceneView* sv) override;
+    //! Recursively creates a sphere with nine smaller sphere groups around it
+    SLNode* SphereGroupRT(SLAssetManager* am,
+                          SLint           depth,
+                          SLfloat         x,
+                          SLfloat         y,
+                          SLfloat         z,
+                          SLfloat         scale,
+                          SLuint          resolution,
+                          SLMaterial*     matGlass,
+                          SLMaterial*     matRed);
+private:
+    SLSceneID _sceneID; //!< Scene variant chosen at construction
+};
+//-----------------------------------------------------------------------------
+
+#endif
diff --git a/apps/app_demo/source/scenes/AppDemoSceneRevolver.cpp b/apps/app_demo/source/scenes/AppDemoSceneRevolver.cpp
new file mode 100644
index 00000000..9f7353c4
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneRevolver.cpp
@@ -0,0 +1,240 @@
+/**
+ * \file AppDemoSceneRevolver.cpp
+ * \brief Implementation for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * https://cpvrlab.github.io/SLProject4/app-framework.html
+ * \date May 2024
+ * \authors Marcus Hudritsch, Marino von Wattenwyl
+ * \copyright http://opensource.org/licenses/GPL-3.0
+ * \remarks Please use clangformat to format the code. See more code style on
+ * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+//-----------------------------------------------------------------------------
+//! Sets the scene name and the info text shown for the revolver scene.
+AppDemoSceneRevolver::AppDemoSceneRevolver() : SLScene("Revolving Mesh Test")
+{
+    info("Examples of revolving mesh objects constructed by rotating a 2D curve. "
+         "The glass shader reflects and refracts the environment map. "
+         "Try ray tracing with key R and come back with the ESC key.");
+}
+//-----------------------------------------------------------------------------
+//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneRevolver::registerAssetsToLoad(SLAssetLoader& al)
+{
+    // Shorthands for the two asset directories used below
+    const auto& texDir    = AppCommon::texturePath;
+    const auto& shaderDir = AppCommon::shaderPath;
+
+    // Single-image textures; assemble() uses them for mat1 (both test map
+    // images), mat2, mat3 and mat4
+    al.addTextureToLoad(_tex1C, texDir + "Testmap_1024_C.jpg");
+    al.addTextureToLoad(_tex1N, texDir + "Testmap_1024_N.jpg");
+    al.addTextureToLoad(_tex2, texDir + "wood0_0512_C.jpg");
+    al.addTextureToLoad(_tex3, texDir + "bricks1_0256_C.jpg");
+    al.addTextureToLoad(_tex4, texDir + "wood2_0512_C.jpg");
+
+    // Six-image texture added to the glass and the wine material
+    al.addTextureToLoad(_tex5,
+                        texDir + "wood2_0256_C.jpg",
+                        texDir + "wood2_0256_C.jpg",
+                        texDir + "gray_0256_C.jpg",
+                        texDir + "wood0_0256_C.jpg",
+                        texDir + "gray_0256_C.jpg",
+                        texDir + "bricks1_0256_C.jpg");
+
+    // Shader program set on the glass and the wine material
+    al.addProgramToLoad(_sp1,
+                        shaderDir + "RefractReflect.vert",
+                        shaderDir + "RefractReflect.frag");
+}
+//-----------------------------------------------------------------------------
+//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneRevolver::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    // Test map material
+    SLMaterial* mat1 = new SLMaterial(am, "mat1", _tex1C, _tex1N);
+
+    // floor material
+    SLMaterial* mat2 = new SLMaterial(am, "mat2", _tex2);
+    mat2->specular(SLCol4f::BLACK);
+
+    // Back wall material
+    SLMaterial* mat3 = new SLMaterial(am, "mat3", _tex3);
+    mat3->specular(SLCol4f::BLACK);
+
+    // Left wall material
+    SLMaterial* mat4 = new SLMaterial(am, "mat4", _tex4);
+    mat4->specular(SLCol4f::BLACK);
+
+    // Glass material
+    SLMaterial* mat5 = new SLMaterial(am,
+                                      "glass",
+                                      SLCol4f::BLACK,
+                                      SLCol4f::WHITE,
+                                      255,
+                                      0.1f,
+                                      0.9f,
+                                      1.5f);
+    mat5->addTexture(_tex5);
+    mat5->program(_sp1);
+
+    // Wine material
+    SLMaterial* mat6 = new SLMaterial(am,
+                                      "wine",
+                                      SLCol4f(0.4f, 0.0f, 0.2f),
+                                      SLCol4f::BLACK,
+                                      255,
+                                      0.2f,
+                                      0.7f,
+                                      1.3f);
+    mat6->addTexture(_tex5);
+    mat6->program(_sp1);
+
+    // camera
+    SLCamera* cam1 = new SLCamera();
+    cam1->name("cam1");
+    cam1->translation(0, 1, 17);
+    cam1->lookAt(0, 1, 0);
+    cam1->focalDist(17);
+    cam1->background().colors(SLCol4f(0.7f, 0.7f, 0.7f),
+                              SLCol4f(0.2f, 0.2f, 0.2f));
+    cam1->setInitialState();
+    cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc);
+
+    // light
+    SLLightSpot* light1 = new SLLightSpot(am,
+                                          this,
+                                          0,
+                                          4,
+                                          0,
+                                          0.3f);
+    light1->powers(0.2f, 1.0f, 1.0f);
+    light1->attenuation(1, 0, 0);
+    // The light gets a node animation with an ellipse track
+    SLAnimation* anim = this->animManager().createNodeAnimation("light1_anim", 4.0f);
+    anim->createNodeAnimTrackForEllipse(light1,
+                                        6.0f,
+                                        A_z,
+                                        6.0f,
+                                        A_x);
+
+    // glass 2D polygon definition for revolution
+    // NOTE(review): some points are listed twice in a row; they look
+    // intentional (e.g. the one marked "crack") - confirm before deduplicating
+    SLVVec3f revG;
+    revG.push_back(SLVec3f(0.00f, 0.00f)); // foot
+    revG.push_back(SLVec3f(2.00f, 0.00f));
+    revG.push_back(SLVec3f(2.00f, 0.00f));
+    revG.push_back(SLVec3f(2.00f, 0.10f));
+    revG.push_back(SLVec3f(1.95f, 0.15f));
+    revG.push_back(SLVec3f(0.40f, 0.50f)); // stand
+    revG.push_back(SLVec3f(0.25f, 0.60f));
+    revG.push_back(SLVec3f(0.20f, 0.70f));
+    revG.push_back(SLVec3f(0.30f, 3.00f));
+    revG.push_back(SLVec3f(0.30f, 3.00f)); // crack
+    revG.push_back(SLVec3f(0.20f, 3.10f));
+    revG.push_back(SLVec3f(0.20f, 3.10f));
+    revG.push_back(SLVec3f(1.20f, 3.90f)); // outer cup
+    revG.push_back(SLVec3f(1.60f, 4.30f));
+    revG.push_back(SLVec3f(1.95f, 4.80f));
+    revG.push_back(SLVec3f(2.15f, 5.40f));
+    revG.push_back(SLVec3f(2.20f, 6.20f));
+    revG.push_back(SLVec3f(2.10f, 7.10f));
+    revG.push_back(SLVec3f(2.05f, 7.15f));
+    revG.push_back(SLVec3f(2.00f, 7.10f)); // inner cup
+    revG.push_back(SLVec3f(2.05f, 6.00f));
+    SLuint res = 30;
+    SLNode* glass = new SLNode(new SLRevolver(am,
+                                              revG,
+                                              SLVec3f(0, 1, 0),
+                                              res,
+                                              true,
+                                              false,
+                                              "GlassRev",
+                                              mat5));
+    glass->translate(0.0f, -3.5f, 0.0f, TS_object);
+
+    // wine 2D polyline definition for revolution with two sided material
+    SLVVec3f revW;
+    revW.push_back(SLVec3f(0.00f, 3.82f));
+    revW.push_back(SLVec3f(0.20f, 3.80f));
+    revW.push_back(SLVec3f(0.80f, 4.00f));
+    revW.push_back(SLVec3f(1.30f, 4.30f));
+    revW.push_back(SLVec3f(1.70f, 4.80f));
+    revW.push_back(SLVec3f(1.95f, 5.40f));
+    revW.push_back(SLVec3f(2.05f, 6.00f));
+    SLMesh* wineMesh = new SLRevolver(am, revW, SLVec3f(0, 1, 0), res, true, false, "WineRev", mat6);
+    // The wine mesh is created with mat6 and gets the glass material mat5 as matOut
+    wineMesh->matOut(mat5);
+    SLNode* wine = new SLNode(wineMesh);
+    wine->translate(0.0f, -3.5f, 0.0f, TS_object);
+
+    // wine fluid top
+    SLNode* wineTop = new SLNode(new SLDisk(am, 2.05f, -SLVec3f::AXISY, res, false, "WineRevTop", mat6));
+    wineTop->translate(0.0f, 2.5f, 0.0f, TS_object);
+
+    // Other revolver objects
+    SLNode* sphere = new SLNode(new SLSphere(am, 1, 16, 16, "sphere", mat1));
+    sphere->translate(3, 0, 0, TS_object);
+    SLNode* cylinder = new SLNode(new SLCylinder(am, 0.1f, 7, 3, 16, true, true, "cylinder", mat1));
+    cylinder->translate(0, 0.5f, 0);
+    cylinder->rotate(90, -1, 0, 0);
+    cylinder->rotate(30, 0, 1, 0);
+    SLNode* cone = new SLNode(new SLCone(am, 1, 3, 3, 16, true, "cone", mat1));
+    cone->translate(-3, -1, 0, TS_object);
+    cone->rotate(90, -1, 0, 0);
+
+    // Cube dimensions
+    // The five rectangles below are positioned from these six values
+    SLfloat pL = -9.0f, pR = 9.0f; // left/right
+    SLfloat pB = -3.5f, pT = 14.5f; // bottom/top
+    SLfloat pN = 9.0f, pF = -9.0f; // near/far
+
+    // bottom rectangle
+    SLNode* b = new SLNode(new SLRectangle(am, SLVec2f(pL, -pN), SLVec2f(pR, -pF), 10, 10, "PolygonFloor", mat2));
+    b->rotate(90, -1, 0, 0);
+    b->translate(0, 0, pB, TS_object);
+
+    // top rectangle
+    SLNode* t = new SLNode(new SLRectangle(am, SLVec2f(pL, pF), SLVec2f(pR, pN), 10, 10, "top", mat2));
+    t->rotate(90, 1, 0, 0);
+    t->translate(0, 0, -pT, TS_object);
+
+    // far rectangle
+    SLNode* f = new SLNode(new SLRectangle(am, SLVec2f(pL, pB), SLVec2f(pR, pT), 10, 10, "far", mat3));
+    f->translate(0, 0, pF, TS_object);
+
+    // left rectangle
+    SLNode* l = new SLNode(new SLRectangle(am, SLVec2f(-pN, pB), SLVec2f(-pF, pT), 10, 10, "left", mat4));
+    l->rotate(90, 0, 1, 0);
+    l->translate(0, 0, pL, TS_object);
+
+    // right rectangle
+    SLNode* r = new SLNode(new SLRectangle(am, SLVec2f(pF, pB), SLVec2f(pN, pT), 10, 10, "right", mat4));
+    r->rotate(90, 0, -1, 0);
+    r->translate(0, 0, -pR, TS_object);
+
+    // Attach everything to a new root node; there is no near rectangle, and
+    // the camera is added last
+    SLNode* scene = new SLNode;
+    this->root3D(scene);
+    scene->addChild(light1);
+    scene->addChild(glass);
+    scene->addChild(wine);
+    scene->addChild(wineTop);
+    scene->addChild(sphere);
+    scene->addChild(cylinder);
+    scene->addChild(cone);
+    scene->addChild(b);
+    scene->addChild(f);
+    scene->addChild(t);
+    scene->addChild(l);
+    scene->addChild(r);
+    scene->addChild(cam1);
+
+    // Make cam1 the camera of the scene view
+    sv->camera(cam1);
+}
+//-----------------------------------------------------------------------------
diff --git a/apps/app_demo/source/scenes/AppDemoSceneRevolver.h b/apps/app_demo/source/scenes/AppDemoSceneRevolver.h
new file mode 100644
index 00000000..375bde32
--- /dev/null
+++ b/apps/app_demo/source/scenes/AppDemoSceneRevolver.h
@@ -0,0 +1,55 @@
+/**
+ * \file AppDemoSceneRevolver.h
+ * \brief Class declaration for an SLScene inherited class
+ * \details For more info about App framework and the scene assembly see:
+ * 
https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEREVOLVER_H +#define APPDEMOSCENEREVOLVER_H + +#include + +//----------------------------------------------------------------------------- +//! Class for revolver mesh scene +class AppDemoSceneRevolver : public SLScene +{ +public: + AppDemoSceneRevolver(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _tex1C; + SLGLTexture* _tex1N; + SLGLTexture* _tex2; + SLGLTexture* _tex3; + SLGLTexture* _tex4; + SLGLTexture* _tex5; + SLGLProgram* _sp1; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneRobot.cpp b/apps/app_demo/source/scenes/AppDemoSceneRobot.cpp new file mode 100644 index 00000000..1a5a50ad --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRobot.cpp @@ -0,0 +1,172 @@ +/** + * \file AppDemoSceneRobot.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneRobot::AppDemoSceneRobot() + : SLScene("GLTF File Demo Scene") +{ + info("Fanuc-CRX Robot with forward kinematic movement."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneRobot::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_robot, + AppCommon::modelPath + + "GLTF/FanucCRX/Fanuc-CRX.gltf"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneRobot::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + this->root3D(scene); + + // Create camera and initialize its parameters + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0.5f, 2.0f); + cam1->lookAt(0, 0.5f, 0); + cam1->background().colors(SLCol4f(0.7f, 0.7f, 0.7f), + SLCol4f(0.2f, 0.2f, 0.2f)); + cam1->focalDist(2); + cam1->setInitialState(); + scene->addChild(cam1); + + // Define directional + SLLightDirect* light1 = new SLLightDirect(am, + this, + 2, + 2, + 2, + 0.2f, + 0.6f, + 0.8f, + 1); + light1->lookAt(0, 0, 0); + light1->attenuation(1, 0, 0); + light1->createsShadows(true); + light1->createShadowMap(1, + 7, + SLVec2f(5, 5), + SLVec2i(2048, 2048)); + light1->doSmoothShadows(true); + light1->castsShadows(false); + scene->addChild(light1); + + SLMaterial* matFloor = new SLMaterial(am, + "matFloor", + SLCol4f::WHITE * 0.5f); + matFloor->ambient(SLCol4f::WHITE * 0.3f); + SLMesh* rectangle = new SLRectangle(am, + SLVec2f(-2, -2), + SLVec2f(2, 2), + 1, + 1, + "rectangle", + matFloor); + SLNode* floorRect = new SLNode(rectangle); + floorRect->rotate(90, -1, 0, 0); + scene->addChild(floorRect); + + // Set missing specular color + _robot->updateMeshMat([](SLMaterial* m) { m->specular(SLCol4f::WHITE); }, + true); + + SLNode* crx_j1 = _robot->findChild("crx_j1"); + SLNode* crx_j2 = _robot->findChild("crx_j2"); + SLNode* crx_j3 = _robot->findChild("crx_j3"); + SLNode* crx_j4 = _robot->findChild("crx_j4"); + SLNode* crx_j5 = _robot->findChild("crx_j5"); + SLNode* crx_j6 = _robot->findChild("crx_j6"); + + SLfloat angleDEG = 45; + SLfloat durationSEC = 3.0f; + + SLAnimation* j1Anim = animManager().createNodeAnimation("j1Anim", + durationSEC, + true, + EC_inOutCubic, + AL_pingPongLoop); + j1Anim->createNodeAnimTrackForRotation3(crx_j1, + -angleDEG, + 0, + angleDEG, + crx_j1->axisYOS()); + + SLAnimation* j2Anim = animManager().createNodeAnimation("j2Anim", 
+ durationSEC, + true, + EC_inOutCubic, + AL_pingPongLoop); + j2Anim->createNodeAnimTrackForRotation3(crx_j2, + -angleDEG, + 0, + angleDEG, + -crx_j2->axisZOS()); + + SLAnimation* j3Anim = animManager().createNodeAnimation("j3Anim", + durationSEC, + true, + EC_inOutCubic, + AL_pingPongLoop); + j3Anim->createNodeAnimTrackForRotation3(crx_j3, + angleDEG, + 0, + -angleDEG, + -crx_j3->axisZOS()); + + SLAnimation* j4Anim = animManager().createNodeAnimation("j4Anim", + durationSEC, + true, + EC_inOutCubic, + AL_pingPongLoop); + j4Anim->createNodeAnimTrackForRotation3(crx_j4, + -2 * angleDEG, + 0, + 2 * angleDEG, + crx_j4->axisXOS()); + + SLAnimation* j5Anim = animManager().createNodeAnimation("j5Anim", + durationSEC, + true, + EC_inOutCubic, + AL_pingPongLoop); + j5Anim->createNodeAnimTrackForRotation3(crx_j5, + -2 * angleDEG, + 0, + 2 * angleDEG, + -crx_j5->axisZOS()); + + SLAnimation* j6Anim = animManager().createNodeAnimation("j6Anim", + durationSEC, + true, + EC_inOutCubic, + AL_pingPongLoop); + j6Anim->createNodeAnimTrackForRotation3(crx_j6, + -2 * angleDEG, + 0, + 2 * angleDEG, + crx_j6->axisXOS()); + + scene->addChild(_robot); + + sv->camera(cam1); + sv->doWaitOnIdle(true); // Saves energy +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneRobot.h b/apps/app_demo/source/scenes/AppDemoSceneRobot.h new file mode 100644 index 00000000..bfcd7c83 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneRobot.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneRobot.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEROBOT_H +#define APPDEMOSCENEROBOT_H + +#include + +//----------------------------------------------------------------------------- +//! Class for a robot arm test scene +class AppDemoSceneRobot : public SLScene +{ +public: + AppDemoSceneRobot(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _robot; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderBlinn.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderBlinn.cpp new file mode 100644 index 00000000..1ea5fcbc --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderBlinn.cpp @@ -0,0 +1,173 @@ +/** + * \file AppDemoSceneShaderBlinn.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderBlinn::AppDemoSceneShaderBlinn(SLstring name, bool perVertex) + : SLScene(name), + _perVertex(perVertex) +{ + if (_perVertex) + { + info("Per-vertex lighting with Blinn-Phong reflection model. " + "The reflection of 5 light sources is calculated per vertex. " + "The green and the white light are attached to the camera, the others are in the scene. " + "The light calculation per vertex is the fastest but leads to artefacts with spot lights"); + } + else + { + info("Per-pixel lighting with Blinn-Phong reflection model. " + "The reflection of 5 light sources is calculated per pixel. 
" + "The light calculation is done in the fragment shader."); + } +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShaderBlinn::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "earth2048_C_Q95.jpg"); + al.addTextureToLoad(_texN, + AppCommon::texturePath + + "earth2048_N.jpg"); + al.addTextureToLoad(_texH, + AppCommon::texturePath + + "earth2048_H.jpg"); + al.addProgramToLoad(_perVrtTm, + AppCommon::shaderPath + "PerVrtBlinnTm.vert", + AppCommon::shaderPath + "PerVrtBlinnTm.frag"); + al.addProgramToLoad(_perVrt, + AppCommon::shaderPath + "PerVrtBlinn.vert", + AppCommon::shaderPath + "PerVrtBlinn.frag"); + al.addProgramToLoad(_perPix, + AppCommon::shaderPath + "PerPixBlinnTmNm.vert", + AppCommon::shaderPath + "PerPixBlinnTmPm.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneShaderBlinn::assemble(SLAssetManager* am, SLSceneView* sv)
+{
+    SLMaterial* mL = nullptr; // material of the left sphere
+    SLMaterial* mM = nullptr; // material of the middle sphere
+    SLMaterial* mR = nullptr; // material of the right sphere
+
+    if (_perVertex)
+    {
+        mL = new SLMaterial(am, "mL", _texC, nullptr, nullptr, nullptr, _perVrtTm);
+        mM = new SLMaterial(am, "mM", _perVrt);
+        mR = new SLMaterial(am, "mR", _texC, nullptr, nullptr, nullptr, _perVrtTm);
+    }
+    else
+    { // per pixel
+        SLGLUniform1f* scale  = new SLGLUniform1f(UT_const, "u_scale", 0.02f, 0.002f, 0, 1);
+        SLGLUniform1f* offset = new SLGLUniform1f(UT_const, "u_offset", -0.02f, 0.002f, -1, 1);
+        _perPix->addUniform1f(scale);
+        _perPix->addUniform1f(offset);
+        mL = new SLMaterial(am, "mL", _texC);
+        mM = new SLMaterial(am, "mM");
+        mR = new SLMaterial(am, "mR", _texC, _texN, _texH, nullptr, _perPix);
+    }
+
+    mM->shininess(500);
+
+    // Base root group node for the scene
+    SLNode* scene = new SLNode;
+    this->root3D(scene);
+
+    SLCamera* cam1 = new SLCamera("Camera 1");
+    cam1->translation(0, 0, 7);
+    cam1->lookAt(0, 0, 0);
+    cam1->focalDist(7);
+    cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f));
+    cam1->setInitialState();
+    scene->addChild(cam1);
+
+    // Define 5 light sources
+
+    // A rectangular white light attached to the camera
+    SLLightRect* lightW = new SLLightRect(am, this, 2.0f, 1.0f);
+    lightW->ambiDiffPowers(0, 5);
+    // The position is set once; lightW becomes a child of cam1 (see addChild below)
+    lightW->translation(0, 2.5f, -7);
+    lightW->rotate(-90, 1, 0, 0);
+    lightW->attenuation(0, 0, 1);
+    cam1->addChild(lightW);
+
+    // A red point light from the front attached in the scene
+    SLLightSpot* lightR = new SLLightSpot(am, this, 0.1f);
+    lightR->ambientColor(SLCol4f(0, 0, 0));
+    lightR->diffuseColor(SLCol4f(1, 0, 0));
+    lightR->specularColor(SLCol4f(1, 0, 0));
+    lightR->translation(0, 0, 2);
+    lightR->lookAt(0, 0, 0);
+    lightR->attenuation(0, 0, 1);
+    scene->addChild(lightR);
+
+    // A green spot head light with 40 deg. spot angle from front right
+    SLLightSpot* lightG = new SLLightSpot(am, this, 0.1f, 20, true);
+    lightG->ambientColor(SLCol4f(0, 0, 0));
+    lightG->diffuseColor(SLCol4f(0, 1, 0));
+    lightG->specularColor(SLCol4f(0, 1, 0));
+    lightG->translation(1.5f, 1, -5.5f);
+    lightG->lookAt(0, 0, -7);
+    lightG->attenuation(1, 0, 0);
+    cam1->addChild(lightG);
+
+    // A blue spot light with 40 deg. spot angle from front left
+    SLLightSpot* lightB = new SLLightSpot(am, this, 0.1f, 20.0f, true);
+    lightB->ambientColor(SLCol4f(0, 0, 0));
+    lightB->diffuseColor(SLCol4f(0, 0, 1));
+    lightB->specularColor(SLCol4f(0, 0, 1));
+    lightB->translation(-1.5f, 1.5f, 1.5f);
+    lightB->lookAt(0, 0, 0);
+    lightB->attenuation(1, 0, 0);
+    SLAnimation* light3Anim = animManager().createNodeAnimation("Ball3_anim",
+                                                                1.0f,
+                                                                true,
+                                                                EC_outQuad,
+                                                                AL_pingPongLoop);
+    light3Anim->createNodeAnimTrackForTranslation(lightB, SLVec3f(0, -2, 0));
+    scene->addChild(lightB);
+
+    // A yellow directional light from the back-bottom
+    // Do constant attenuation for directional lights since it is infinitely far away
+    SLLightDirect* lightY = new SLLightDirect(am, this);
+    lightY->ambientColor(SLCol4f(0, 0, 0));
+    lightY->diffuseColor(SLCol4f(1, 1, 0));
+    lightY->specularColor(SLCol4f(1, 1, 0));
+    lightY->translation(-1.5f, -1.5f, 1.5f);
+    lightY->lookAt(0, 0, 0);
+    lightY->attenuation(1, 0, 0);
+    scene->addChild(lightY);
+
+    // Add some meshes to be lighted
+    SLNode* sphereL = new SLNode(new SLSpheric(am, 1.0f, 0.0f, 180.0f, 36, 36, "Sphere", mL));
+    sphereL->translate(-2, 0, 0);
+    sphereL->rotate(90, -1, 0, 0);
+    SLNode* sphereM = new SLNode(new SLSpheric(am, 1.0f, 0.0f, 180.0f, 36, 36, "Sphere", mM));
+    SLNode* sphereR = new SLNode(new SLSpheric(am, 1.0f, 0.0f, 180.0f, 36, 36, "Sphere", mR));
+    sphereR->translate(2, 0, 0);
+    sphereR->rotate(90, -1, 0, 0);
+
+    scene->addChild(sphereL);
+    scene->addChild(sphereM);
+    scene->addChild(sphereR);
+    sv->camera(cam1);
+}
+//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderBlinn.h b/apps/app_demo/source/scenes/AppDemoSceneShaderBlinn.h new file mode 100644 index 00000000..47940fd4 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderBlinn.h @@ -0,0 +1,57 @@ +/** + * \file AppDemoSceneShaderBlinn.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERBLINN_H +#define APPDEMOSCENESHADERBLINN_H + +#include + +//----------------------------------------------------------------------------- +//! Class for demo scene with Blinn-Phong lighting per vertex or per pixel +class AppDemoSceneShaderBlinn : public SLScene +{ +public: + AppDemoSceneShaderBlinn(SLstring name, + bool perVertex); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. 
Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + bool _perVertex; + SLGLProgram* _perVrtTm = nullptr; + SLGLProgram* _perVrt = nullptr; + SLGLProgram* _perPix = nullptr; + SLGLTexture* _texC = nullptr; + SLGLTexture* _texN = nullptr; + SLGLTexture* _texH = nullptr; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderBump.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderBump.cpp new file mode 100644 index 00000000..aaff8d99 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderBump.cpp @@ -0,0 +1,92 @@ +/** + * \file AppDemoSceneShaderBump.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderBump::AppDemoSceneShaderBump() + : SLScene("Normal Map Bump Mapping") +{ + info("Normal map bump mapping combined with a spot and a directional lighting."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShaderBump::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "brickwall0512_C.jpg"); + al.addTextureToLoad(_texN, + AppCommon::texturePath + + "brickwall0512_N.jpg"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneShaderBump::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create materials + SLMaterial* m1 = new SLMaterial(am, "m1", _texC, _texN); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(-10, 10, 10); + cam1->lookAt(0, 0, 0); + cam1->focalDist(cam1->translationWS().distance(SLVec3f::ZERO)); + cam1->background().colors(SLCol4f(0.5f, 0.5f, 0.5f)); + cam1->setInitialState(); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 0.3f, + 40, + true); + light1->powers(0.1f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + light1->translation(0, 0, 5); + light1->lookAt(0, 0, 0); + + SLLightDirect* light2 = new SLLightDirect(am, this); + light2->ambientColor(SLCol4f(0, 0, 0)); + light2->diffuseColor(SLCol4f(1, 1, 0)); + light2->specularColor(SLCol4f(1, 1, 0)); + light2->translation(-5, -5, 5); + light2->lookAt(0, 0, 0); + light2->attenuation(1, 0, 0); + + SLAnimation* anim = this->animManager().createNodeAnimation("light1_anim", + 2.0f); + 
anim->createNodeAnimTrackForEllipse(light1, + 2.0f, + A_x, + 2.0f, + A_Y); + + SLNode* scene = new SLNode; + this->root3D(scene); + scene->addChild(light1); + scene->addChild(light2); + scene->addChild(new SLNode(new SLRectangle(am, + SLVec2f(-5, -5), + SLVec2f(5, 5), + 1, + 1, + "Rect", + m1))); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderBump.h b/apps/app_demo/source/scenes/AppDemoSceneShaderBump.h new file mode 100644 index 00000000..733134f9 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderBump.h @@ -0,0 +1,51 @@ +/** + * \file AppDemoSceneShaderBump.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERBUMP_H +#define APPDEMOSCENESHADERBUMP_H + +#include + +//----------------------------------------------------------------------------- +//! Class for normal map bump mapping +class AppDemoSceneShaderBump : public SLScene +{ +public: + AppDemoSceneShaderBump(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! 
After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texN; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderCook.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderCook.cpp new file mode 100644 index 00000000..fd24e5e5 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderCook.cpp @@ -0,0 +1,184 @@ +/** + * \file AppDemoSceneShaderCook.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderCook::AppDemoSceneShaderCook() + : SLScene("Cook-Torrance Shading") +{ + info("Cook-Torrance reflection model. Left-Right: roughness 0.05-1, Top-Down: metallic: 1-0. 
" + "The center sphere has roughness and metallic encoded in textures. " + "The reflection model produces a more physically based light reflection " + "than the standard Blinn-Phong reflection model."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShaderCook::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "rusty-metal_2048_C.jpg"); + al.addTextureToLoad(_texN, + AppCommon::texturePath + + "rusty-metal_2048_N.jpg"); + al.addTextureToLoad(_texM, + AppCommon::texturePath + + "rusty-metal_2048_M.jpg"); + al.addTextureToLoad(_texR, + AppCommon::texturePath + + "rusty-metal_2048_R.jpg"); + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "PerPixCook.vert", + AppCommon::shaderPath + "PerPixCook.frag"); + al.addProgramToLoad(_spTex, + AppCommon::shaderPath + "PerPixCookTm.vert", + AppCommon::shaderPath + "PerPixCookTm.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneShaderCook::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Base root group node for the scene + SLNode* scene = new SLNode; + this->root3D(scene); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 30); + cam1->lookAt(0, 0, 0); + cam1->background().colors(SLCol4f::BLACK); + cam1->focalDist(30); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + scene->addChild(cam1); + + // Create spheres and materials with roughness & metallic values between 0 and 1 + const SLint nrRows = 7; + const SLint nrCols = 7; + SLfloat spacing = 2.5f; + SLfloat maxX = (float)(nrCols - 1) * spacing * 0.5f; + SLfloat maxY = (float)(nrRows - 1) * spacing * 0.5f; + SLfloat deltaR = 1.0f / (float)(nrRows - 1); + SLfloat deltaM = 1.0f / (float)(nrCols - 1); + + SLMaterial* mat[nrRows * nrCols]; + SLint i = 0; + SLfloat y = -maxY; + for (SLint m = 0; m < nrRows; ++m) + { + SLfloat x = -maxX; + for (SLint r = 0; r < nrCols; ++r) + { + if (m == nrRows / 2 && r == nrCols / 2) + { + // The center sphere has roughness and metallic encoded in textures + mat[i] = new SLMaterial(am, + "CookTorranceMatTex", + nullptr, + _texC, + _texN, + _texM, + _texR, + nullptr, + _spTex); + } + else + { + // Cook-Torrance material without textures + mat[i] = new SLMaterial(am, + "CookTorranceMat", + nullptr, + SLCol4f::RED * 0.5f, + Utils::clamp((float)r * deltaR, 0.05f, 1.0f), + (float)m * deltaM, + _sp); + } + + SLNode* node = new SLNode(new SLSpheric(am, 1.0f, 0.0f, 180.0f, 32, 32, "Sphere", mat[i])); + node->translate(x, y, 0); + scene->addChild(node); + x += spacing; + i++; + } + y += spacing; + } + + // Add 5 Lights: 2 point lights, 2 directional lights and 1 spotlight in the center. 
+ SLLight::gamma = 2.2f; + SLLightSpot* light1 = new SLLightSpot(am, + this, + -maxX, + maxY, + maxY, + 0.2f, + 180, + 0, + 1000, + 1000); + light1->attenuation(0, 0, 1); + SLLightDirect* light2 = new SLLightDirect(am, + this, + maxX, + maxY, + maxY, + 0.5f, + 0, + 10, + 10); + light2->lookAt(0, 0, 0); + light2->attenuation(0, 0, 1); + SLLightSpot* light3 = new SLLightSpot(am, + this, + 0, + 0, + maxY, + 0.2f, + 36, + 0, + 1000, + 1000); + light3->attenuation(0, 0, 1); + SLLightDirect* light4 = new SLLightDirect(am, + this, + -maxX, + -maxY, + maxY, + 0.5f, + 0, + 10, + 10); + light4->lookAt(0, 0, 0); + light4->attenuation(0, 0, 1); + SLLightSpot* light5 = new SLLightSpot(am, + this, + maxX, + -maxY, + maxY, + 0.2f, + 180, + 0, + 1000, + 1000); + light5->attenuation(0, 0, 1); + scene->addChild(light1); + scene->addChild(light2); + scene->addChild(light3); + scene->addChild(light4); + scene->addChild(light5); + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderCook.h b/apps/app_demo/source/scenes/AppDemoSceneShaderCook.h new file mode 100644 index 00000000..98927199 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderCook.h @@ -0,0 +1,55 @@ +/** + * \file AppDemoSceneShaderCook.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERCOOK_H +#define APPDEMOSCENESHADERCOOK_H + +#include + +//----------------------------------------------------------------------------- +//! 
Class for Cook-Torrance lighting demo scene +class AppDemoSceneShaderCook : public SLScene +{ +public: + AppDemoSceneShaderCook(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLProgram* _sp; + SLGLProgram* _spTex; + SLGLTexture* _texC; + SLGLTexture* _texN; + SLGLTexture* _texM; + SLGLTexture* _texR; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderEarth.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderEarth.cpp new file mode 100644 index 00000000..11a0f4d7 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderEarth.cpp @@ -0,0 +1,127 @@ +/** + * \file AppDemoSceneShaderEarth.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + * \authors Marcus Hudritsch, Earth Shader from Markus Knecht + * \copyright http://opensource.org/licenses/GPL-3.0 +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderEarth::AppDemoSceneShaderEarth() + : SLScene("Earth Shader Test") +{ + info("Complex earth shader with 7 textures: day color, night color, " + "normal, height & gloss map of earth, color & alpha-map of clouds.\n" + "Use (SHIFT) & key X to change scale of the parallax mapping\n" + "Use (SHIFT) & key O to change offset of the parallax mapping"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneShaderEarth::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "earth2048_C.png"); + al.addTextureToLoad(_texN, + AppCommon::texturePath + + "earth2048_N.jpg"); + al.addTextureToLoad(_texH, + AppCommon::texturePath + + "earth2048_H.jpg"); + al.addTextureToLoad(_texG, + AppCommon::texturePath + + "earth2048_S.jpg"); + al.addTextureToLoad(_texNC, + AppCommon::texturePath + + "earthNight2048_C.jpg"); + al.addTextureToLoad(_texClC, + AppCommon::texturePath + + "earthCloud1024_alpha_C.png"); + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "PerPixBlinnTmNm.vert", + AppCommon::shaderPath + "PerPixBlinnTmNmEarth.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneShaderEarth::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLGLUniform1f* scale = new SLGLUniform1f(UT_const, + "u_scale", + 0.02f, + 0.002f, + 0, + 1, + (SLKey)'X'); + SLGLUniform1f* offset = new SLGLUniform1f(UT_const, + "u_offset", + -0.02f, + 0.002f, + -1, + 1, + (SLKey)'O'); + this->eventHandlers().push_back(scale); + this->eventHandlers().push_back(offset); + _sp->addUniform1f(scale); + _sp->addUniform1f(offset); + + // Create materials + SLMaterial* matEarth = new SLMaterial(am, + "matEarth", + _texC, + _texN, + _texH, + _texG, + _sp); + matEarth->addTexture(_texClC); + matEarth->addTexture(_texNC); + matEarth->shininess(4000); + matEarth->program(_sp); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 4); + cam1->lookAt(0, 0, 0); + cam1->focalDist(4); + cam1->background().colors(SLCol4f(0, 0, 0)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLLightSpot* sun = new SLLightSpot(am, this); + sun->powers(0.0f, 1.0f, 0.2f); + sun->attenuation(1, 0, 0); + + SLAnimation* anim = this->animManager().createNodeAnimation("light1_anim", + 
24.0f); + anim->createNodeAnimTrackForEllipse(sun, + 50.0f, + A_x, + 50.0f, + A_z); + + SLuint res = 30; + SLNode* earth = new SLNode(new SLSphere(am, + 1, + res, + res, + "Earth", + matEarth)); + earth->rotate(90, -1, 0, 0); + + SLNode* scene = new SLNode; + this->root3D(scene); + scene->addChild(sun); + scene->addChild(earth); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderEarth.h b/apps/app_demo/source/scenes/AppDemoSceneShaderEarth.h new file mode 100644 index 00000000..34239b9c --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderEarth.h @@ -0,0 +1,56 @@ +/** + * \file AppDemoSceneShaderEarth.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADEREARTH_H +#define APPDEMOSCENESHADEREARTH_H + +#include + +//----------------------------------------------------------------------------- +//! Class for the earth shader demo scene +class AppDemoSceneShaderEarth : public SLScene +{ +public: + AppDemoSceneShaderEarth(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! 
After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLProgram* _sp; + SLGLTexture* _texC; + SLGLTexture* _texN; + SLGLTexture* _texH; + SLGLTexture* _texG; + SLGLTexture* _texNC; + SLGLTexture* _texClC; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderIBL.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderIBL.cpp new file mode 100644 index 00000000..80b640e4 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderIBL.cpp @@ -0,0 +1,155 @@ +/** + * \file AppDemoSceneShaderIBL.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderIBL::AppDemoSceneShaderIBL() + : SLScene("Image Based Lighting Test Scene") +{ + info("Image-based lighting from skybox using high dynamic range images. " + "It uses the Cook-Torrance reflection model also to calculate the " + "ambient light part from the surrounding HDR skybox."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShaderIBL::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + + "gold-scuffed_2048_C.png"); + al.addTextureToLoad(_texN, + AppCommon::texturePath + + "gold-scuffed_2048_N.png"); + al.addTextureToLoad(_texM, + AppCommon::texturePath + + "gold-scuffed_2048_M.png"); + al.addTextureToLoad(_texR, + AppCommon::texturePath + + "gold-scuffed_2048_R.png"); + al.addTextureToLoad(_texA, + AppCommon::texturePath + + "gold-scuffed_2048_A.png"); + + al.addSkyboxToLoad(_skybox, + al.texturePath() + "env_barce_rooftop.hdr", + SLVec2i(256, 256), + "HDR Skybox"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneShaderIBL::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + this->root3D(scene); + + // Create camera and initialize its parameters + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 30); + cam1->lookAt(0, 0, 0); + cam1->background().colors(SLCol4f(0.2f, 0.2f, 0.2f)); + cam1->focalDist(30); + cam1->setInitialState(); + scene->addChild(cam1); + + // Add directional light with a position that corresponds roughly to the sun direction + SLLight::gamma = 2.2f; + SLLightDirect* light1 = new SLLightDirect(am, + this, + 4.0f, + .3f, + 2.0f, + 0.5f, + 0, + 1, + 1); + light1->lookAt(0, 0, 0); + light1->attenuation(1, 0, 0); + light1->createsShadows(true); + light1->createShadowMapAutoSize(cam1, SLVec2i(2048, 2048), 4); + light1->shadowMap()->cascadesFactor(30.0); + light1->doSmoothShadows(true); + light1->castsShadows(false); + light1->shadowMinBias(0.001f); + light1->shadowMaxBias(0.003f); + scene->addChild(light1); + + // Create spheres and materials with roughness & metallic values between 0 and 1 + const SLint nrRows = 7; + const SLint nrCols = 7; + SLfloat spacing = 2.5f; + SLfloat maxX = (float)((int)(nrCols / 2) * spacing); + SLfloat maxY = (float)((int)(nrRows / 2) * spacing); + SLfloat deltaR = 1.0f / (float)(nrRows - 1); + SLfloat deltaM = 1.0f / (float)(nrCols - 1); + + SLMaterial* mat[nrRows * nrCols]; + SLint i = 0; + SLfloat y = -maxY; + for (SLint m = 0; m < nrRows; ++m) + { + SLfloat x = -maxX; + for (SLint r = 0; r < nrCols; ++r) + { + if (m == nrRows / 2 && r == nrCols / 2) + { + // The center sphere has roughness and metallic encoded in textures + // and the prefiltered textures for IBL + mat[i] = new SLMaterial(am, + "IBLMatTex", + _skybox, + _texC, + _texN, + _texM, + _texR, + _texA); + } + else + { + // Cook-Torrance material with IBL but without textures + mat[i] = new SLMaterial(am, + "IBLMat", + _skybox, + SLCol4f::WHITE * 0.5f, + 
Utils::clamp((float)r * deltaR, 0.05f, 1.0f), + (float)m * deltaM); + } + + SLNode* node = new SLNode(new SLSpheric(am, + 1.0f, + 0.0f, + 180.0f, + 32, + 32, + "Sphere", + mat[i])); + node->translate(x, y, 0); + scene->addChild(node); + x += spacing; + i++; + } + y += spacing; + } + + sv->camera(cam1); + this->skybox(_skybox); + + // Save energy + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderIBL.h b/apps/app_demo/source/scenes/AppDemoSceneShaderIBL.h new file mode 100644 index 00000000..fce51ff3 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderIBL.h @@ -0,0 +1,55 @@ +/** + * \file AppDemoSceneShaderIBL.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERIBL_H +#define APPDEMOSCENESHADERIBL_H + +#include + +//----------------------------------------------------------------------------- +//! Class for image-based lighting demo scene +class AppDemoSceneShaderIBL : public SLScene +{ +public: + AppDemoSceneShaderIBL(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! 
After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texN; + SLGLTexture* _texM; + SLGLTexture* _texR; + SLGLTexture* _texA; + SLSkybox* _skybox; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderParallax.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderParallax.cpp new file mode 100644 index 00000000..4b27b868 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderParallax.cpp @@ -0,0 +1,121 @@ +/** + * \file AppDemoSceneShaderParallax.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderParallax::AppDemoSceneShaderParallax() + : SLScene("Parallax Bump Mapping Test") +{ + info("Normal map parallax mapping with a spot and a directional light.\n" + "Use X-Key to increment (decrement w. shift) parallax scale.\n" + "Use O-Key to increment (decrement w. shift) parallax offset.\n"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneShaderParallax::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texC, + AppCommon::texturePath + "brickwall0512_C.jpg"); + al.addTextureToLoad(_texN, + AppCommon::texturePath + "brickwall0512_N.jpg"); + al.addTextureToLoad(_texH, + AppCommon::texturePath + "brickwall0512_H.jpg"); + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "PerPixBlinnTmNm.vert", + AppCommon::shaderPath + "PerPixBlinnTmPm.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneShaderParallax::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLGLUniform1f* scale = new SLGLUniform1f(UT_const, + "u_scale", + 0.04f, + 0.002f, + 0, + 1, + (SLKey)'X'); + SLGLUniform1f* offset = new SLGLUniform1f(UT_const, + "u_offset", + -0.03f, + 0.002f, + -1, + 1, + (SLKey)'O'); + this->eventHandlers().push_back(scale); + this->eventHandlers().push_back(offset); + _sp->addUniform1f(scale); + _sp->addUniform1f(offset); + + // Create materials + SLMaterial* m1 = new SLMaterial(am, + "mat1", + _texC, + _texN, + _texH, + nullptr, + _sp); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(-10, 10, 10); + cam1->lookAt(0, 0, 0); + cam1->focalDist(cam1->translationWS().distance(SLVec3f::ZERO)); + cam1->background().colors(SLCol4f(0.5f, 0.5f, 0.5f)); + cam1->setInitialState(); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 0.3f, + 40, + true); + light1->powers(0.1f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + light1->translation(0, 0, 5); + light1->lookAt(0, 0, 0); + + SLLightDirect* light2 = new SLLightDirect(am, this); + light2->ambientColor(SLCol4f(0, 0, 0)); + light2->diffuseColor(SLCol4f(1, 1, 0)); + light2->specularColor(SLCol4f(1, 1, 0)); + light2->translation(-5, -5, 5); + light2->lookAt(0, 0, 0); + light2->attenuation(1, 0, 0); + + SLAnimation* anim = this->animManager().createNodeAnimation("light1_anim", 2.0f); + anim->createNodeAnimTrackForEllipse(light1, + 2.0f, + A_x, + 2.0f, + A_Y); + + SLNode* scene = new SLNode; + this->root3D(scene); + scene->addChild(light1); + scene->addChild(light2); + scene->addChild(new SLNode(new SLRectangle(am, + SLVec2f(-5, -5), + SLVec2f(5, 5), + 1, + 1, + "Rect", + m1))); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderParallax.h b/apps/app_demo/source/scenes/AppDemoSceneShaderParallax.h new file mode 100644 index 00000000..5fe67606 --- 
/dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderParallax.h @@ -0,0 +1,53 @@ +/** + * \file AppDemoSceneShaderParallax.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERPARALLAX_H +#define APPDEMOSCENESHADERPARALLAX_H + +#include + +//----------------------------------------------------------------------------- +//! Class for parallax bump mapping demo scene +class AppDemoSceneShaderParallax : public SLScene +{ +public: + AppDemoSceneShaderParallax(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texC; + SLGLTexture* _texN; + SLGLTexture* _texH; + SLGLProgram* _sp; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderSkybox.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderSkybox.cpp new file mode 100644 index 00000000..d6303b9d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderSkybox.cpp @@ -0,0 +1,129 @@ +/** + * \file AppDemoSceneShaderSkybox.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderSkybox::AppDemoSceneShaderSkybox() + : SLScene("Image Based Lighting Test Scene") +{ + info("Image-based lighting from skybox using high dynamic range images. " + "It uses the Cook-Torrance reflection model also to calculate the " + "ambient light part from the surrounding HDR skybox."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneShaderSkybox::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addSkyboxToLoad(_skybox, + "Desert+X1024_C.jpg", + "Desert-X1024_C.jpg", + "Desert+Y1024_C.jpg", + "Desert-Y1024_C.jpg", + "Desert+Z1024_C.jpg", + "Desert-Z1024_C.jpg"); + al.addProgramToLoad(_spRefl, + AppCommon::shaderPath + "Reflect.vert", + AppCommon::shaderPath + "Reflect.frag"); + al.addProgramToLoad(_spRefr, + AppCommon::shaderPath + "RefractReflect.vert", + AppCommon::shaderPath + "RefractReflect.frag"); + al.addNodeToLoad(_teapot, + AppCommon::modelPath + + "FBX/Teapot/Teapot.fbx"); + al.addNodeToLoad(_suzanne, + AppCommon::modelPath + + "FBX/Suzanne/Suzanne.fbx"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneShaderSkybox::assemble(SLAssetManager* am, SLSceneView* sv) +{ + + // Material for mirror + SLMaterial* refl = new SLMaterial(am, + "refl", + SLCol4f::BLACK, + SLCol4f::WHITE, + 1000, + 1.0f); + refl->addTexture(_skybox->environmentCubemap()); + refl->program(_spRefl); + // Material for glass + SLMaterial* refr = new SLMaterial(am, + "refr", + SLCol4f::BLACK, + SLCol4f::BLACK, + 100, + 0.1f, + 0.9f, + 1.5f); + refr->translucency(1000); + refr->transmissive(SLCol4f::WHITE); + refr->addTexture(_skybox->environmentCubemap()); + refr->program(_spRefr); + + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + this->root3D(scene); + + // Create camera in the center + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 5); + cam1->setInitialState(); + scene->addChild(cam1); + + // There is no light needed in this scene. 
All reflections come from cube maps + // But ray tracing needs light sources + // Create directional light for the sunlight + SLLightDirect* light = new SLLightDirect(am, this, 0.5f); + light->ambientColor(SLCol4f(0.3f, 0.3f, 0.3f)); + light->attenuation(1, 0, 0); + light->translate(1, 1, -1); + light->lookAt(-1, -1, 1); + scene->addChild(light); + + // Center sphere + SLNode* sphere = new SLNode(new SLSphere(am, + 0.5f, + 32, + 32, + "Sphere", + refr)); + scene->addChild(sphere); + + // configure teapot + _teapot->translate(-1.5f, -0.5f, 0); + + SLNode* teapot = _teapot->findChild("Teapot"); + teapot->setMeshMat(refl, true); + scene->addChild(_teapot); + + // configure Suzanne + _suzanne->translate(1.5f, -0.5f, 0); + SLNode* suzanne = _suzanne->findChild("Suzanne"); + suzanne->setMeshMat(refr, true); + scene->addChild(_suzanne); + + sv->camera(cam1); + this->skybox(_skybox); + + // Save energy + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderSkybox.h b/apps/app_demo/source/scenes/AppDemoSceneShaderSkybox.h new file mode 100644 index 00000000..9f2ffdb1 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderSkybox.h @@ -0,0 +1,54 @@ +/** + * \file AppDemoSceneShaderSkybox.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERSKYBOX_H +#define APPDEMOSCENESHADERSKYBOX_H + +#include + +//----------------------------------------------------------------------------- +//! 
Class for skybox shader demo scene +class AppDemoSceneShaderSkybox : public SLScene +{ +public: + AppDemoSceneShaderSkybox(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLSkybox* _skybox; + SLGLProgram* _spRefl; + SLGLProgram* _spRefr; + SLNode* _teapot; + SLNode* _suzanne; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderWave.cpp b/apps/app_demo/source/scenes/AppDemoSceneShaderWave.cpp new file mode 100644 index 00000000..3fdf773d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderWave.cpp @@ -0,0 +1,103 @@ +/** + * \file AppDemoSceneShaderWave.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShaderWave::AppDemoSceneShaderWave() + : SLScene("Per vertex wave shader") +{ + + info("Vertex Shader with wave displacement. " + "Use H-Key to increment (decrement w. shift) the wave height."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShaderWave::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "Wave.vert", + AppCommon::shaderPath + "Wave.frag"); +} +//----------------------------------------------------------------------------- +//! 
After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneShaderWave::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 3, 8); + cam1->lookAt(0, 0, 0); + cam1->focalDist(cam1->translationOS().length()); + cam1->background().colors(SLCol4f(0.1f, 0.4f, 0.8f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + // Create generic shader program with 4 custom uniforms + SLGLUniform1f* u_h = new SLGLUniform1f(UT_const, + "u_h", + 0.1f, + 0.05f, + 0.0f, + 0.5f, + (SLKey)'H'); + this->eventHandlers().push_back(u_h); + _sp->addUniform1f(u_h); + _sp->addUniform1f(new SLGLUniform1f(UT_inc, "u_t", 0.0f, 0.06f)); + _sp->addUniform1f(new SLGLUniform1f(UT_const, "u_a", 2.5f)); + _sp->addUniform1f(new SLGLUniform1f(UT_incDec, "u_b", 2.2f, 0.01f, 2.0f, 2.5f)); + + // Create materials + SLMaterial* matWater = new SLMaterial(am, + "matWater", + SLCol4f(0.45f, 0.65f, 0.70f), + SLCol4f::WHITE, + 300); + matWater->program(_sp); + SLMaterial* matRed = new SLMaterial(am, + "matRed", + SLCol4f(1.00f, 0.00f, 0.00f)); + + // water rectangle in the y=0 plane + SLNode* wave = new SLNode(new SLRectangle(am, + SLVec2f(-Utils::PI, -Utils::PI), + SLVec2f(Utils::PI, Utils::PI), + 40, + 40, + "WaterRect", + matWater)); + wave->rotate(90, -1, 0, 0); + + SLLightSpot* light0 = new SLLightSpot(am, this); + light0->ambiDiffPowers(0, 1); + light0->translate(0, 4, -4, TS_object); + light0->attenuation(1, 0, 0); + + SLNode* scene = new SLNode; + this->root3D(scene); + scene->addChild(light0); + scene->addChild(wave); + scene->addChild(new SLNode(new SLSphere(am, + 1, + 32, + 32, + "Red Sphere", + matRed))); + scene->addChild(cam1); + + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShaderWave.h 
b/apps/app_demo/source/scenes/AppDemoSceneShaderWave.h new file mode 100644 index 00000000..2a3c358d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShaderWave.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneShaderWave.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADERWAVE_H +#define APPDEMOSCENESHADERWAVE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for per vertex wave shader +class AppDemoSceneShaderWave : public SLScene +{ +public: + AppDemoSceneShaderWave(); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLProgram* _sp; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowBasic.cpp b/apps/app_demo/source/scenes/AppDemoSceneShadowBasic.cpp new file mode 100644 index 00000000..8e4cbadf --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowBasic.cpp @@ -0,0 +1,99 @@ +/** + * \file AppDemoSceneShadowBasic.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShadowBasic::AppDemoSceneShadowBasic() : SLScene("Basic Shadow Mapping Scene") +{ + info("Shadow Mapping is a technique to render shadows in two passes.\n" + "In pass 1 the scene gets rendered from each light source.\n" + "In pass 2 the scene gets rendered from the camera with shadows."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here (none for this scene). +void AppDemoSceneShadowBasic::registerAssetsToLoad(SLAssetLoader& al) +{ +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneShadowBasic::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLMaterial* matPerPixSM = new SLMaterial(am, "m1"); + + // Base root group node for the scene + SLNode* scene = new SLNode; + this->root3D(scene); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 7, 12); + cam1->lookAt(0, 1, 0); + cam1->focalDist(8); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + scene->addChild(cam1); + + // Create light source + // Do constant attenuation for directional lights since it is infinitely far away + SLLightDirect* light = new SLLightDirect(am, this); + + light->powers(0.0f, 1.0f, 1.0f); + light->translation(0, 5, 0); + light->lookAt(0, 0, 0); + light->attenuation(1, 0, 0); + light->createsShadows(true); + light->createShadowMap(); + light->shadowMap()->rayCount(SLVec2i(16, 16)); + light->castsShadows(false); + scene->addChild(light); + + // Add a sphere which casts shadows + SLNode* sphereNode = new SLNode(new SLSpheric(am, + 1, + 0, + 180, + 20, + 20, + "Sphere", + matPerPixSM)); + sphereNode->translate(0, 2.0, 0); + sphereNode->castsShadows(true); + scene->addChild(sphereNode); + + SLAnimation* anim = this->animManager().createNodeAnimation("sphere_anim", 2.0f); + anim->createNodeAnimTrackForEllipse(sphereNode, + 0.5f, + A_x, + 0.5f, + A_z); + + // Add a box which receives shadows + SLNode* boxNode = new SLNode(new SLBox(am, + -5, + -1, + -5, + 5, + 0, + 5, + "Box", + matPerPixSM)); + boxNode->castsShadows(false); + scene->addChild(boxNode); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowBasic.h b/apps/app_demo/source/scenes/AppDemoSceneShadowBasic.h new file mode 100644 index 00000000..105bed06 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowBasic.h @@ -0,0 +1,46 @@ +/** + * \file AppDemoSceneShadowBasic.h + * \brief Class declaration for an SLScene inherited 
class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADOWBASIC_H +#define APPDEMOSCENESHADOWBASIC_H + +#include + +//----------------------------------------------------------------------------- +//! Class for basic shadow mapping demo scene +class AppDemoSceneShadowBasic : public SLScene +{ +public: + AppDemoSceneShadowBasic(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowCascaded.cpp b/apps/app_demo/source/scenes/AppDemoSceneShadowCascaded.cpp new file mode 100644 index 00000000..5a181719 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowCascaded.cpp @@ -0,0 +1,105 @@ +/** + * \file AppDemoSceneShadowCascaded.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShadowCascaded::AppDemoSceneShadowCascaded() + : SLScene("Cascaded Shadow Mapping Demo Scene") +{ + info("Cascaded Shadow Mapping uses several cascades of shadow maps to " + "provide higher resolution shadows near the camera and lower " + "resolution shadows further away."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShadowCascaded::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_teapot, + AppCommon::modelPath+ + "FBX/Teapot/Teapot.fbx"); +} +//----------------------------------------------------------------------------- +//! 
After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneShadowCascaded::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Setup shadow mapping material + SLMaterial* matPerPixSM = new SLMaterial(am, "m1"); + + // Base root group node for the scene + SLNode* scene = new SLNode; + this->root3D(scene); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 7, 12); + cam1->lookAt(0, 1, 0); + cam1->focalDist(8); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + scene->addChild(cam1); + + // Create light source + // Do constant attenuation for directional lights since it is infinitely far away + SLLightDirect* light = new SLLightDirect(am, this); + light->powers(0.0f, 1.0f, 1.0f); + light->translation(0, 5, 0); + light->lookAt(0, 0, 0); + light->attenuation(1, 0, 0); + light->createsShadows(true); + light->doCascadedShadows(true); + light->createShadowMapAutoSize(cam1); + light->shadowMap()->rayCount(SLVec2i(16, 16)); + light->castsShadows(false); + scene->addChild(light); + + // Add a sphere which casts shadows + SLNode* sphereNode = new SLNode(new SLSpheric(am, + 1, + 0, + 180, + 20, + 20, + "Sphere", + matPerPixSM)); + sphereNode->translate(0, 2.0, 0); + sphereNode->castsShadows(true); + scene->addChild(sphereNode); + + SLAnimation* anim = this->animManager().createNodeAnimation("sphere_anim", + 2.0f); + anim->createNodeAnimTrackForEllipse(sphereNode, + 0.5f, + A_x, + 0.5f, + A_z); + + // Add a box which receives shadows + SLNode* boxNode = new SLNode(new SLBox(am, + -5, + -1, + -5, + 5, + 0, + 5, + "Box", + matPerPixSM)); + boxNode->castsShadows(false); + scene->addChild(boxNode); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowCascaded.h b/apps/app_demo/source/scenes/AppDemoSceneShadowCascaded.h new file mode 100644 index 00000000..57930042 --- /dev/null +++ 
b/apps/app_demo/source/scenes/AppDemoSceneShadowCascaded.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneShadowCascaded.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADOWCASCADED_H +#define APPDEMOSCENESHADOWCASCADED_H + +#include + +//----------------------------------------------------------------------------- +//! Class for demo scene for cascaded shadow mapping +class AppDemoSceneShadowCascaded : public SLScene +{ +public: + AppDemoSceneShadowCascaded(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _teapot; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowLightPoint.cpp b/apps/app_demo/source/scenes/AppDemoSceneShadowLightPoint.cpp new file mode 100644 index 00000000..401124f0 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowLightPoint.cpp @@ -0,0 +1,171 @@ +/** + * \file AppDemoSceneShadowLightPoint.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch, Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShadowLightPoint::AppDemoSceneShadowLightPoint() + : SLScene("Shadow Mapping for point lights") +{ + info("Point lights use cubemaps to store shadow maps."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here (none for this scene). +void AppDemoSceneShadowLightPoint::registerAssetsToLoad(SLAssetLoader& al) +{ +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneShadowLightPoint::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Setup shadow mapping material + SLMaterial* matPerPixSM = new SLMaterial(am, "m1"); + + // Base root group node for the scene + SLNode* scene = new SLNode; + this->root3D(scene); + + // Create camera and attach it to the scene graph + SLCamera* cam1 = new SLCamera; + cam1->translation(0, 0, 8); + cam1->lookAt(0, 0, 0); + cam1->fov(27); + cam1->focalDist(cam1->translationOS().length()); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + scene->addChild(cam1); + + // Create 3 lights (red, green, blue) that share one ellipse animation + SLAnimation* anim = this->animManager().createNodeAnimation("light_anim", 4.0f); + + for (SLint i = 0; i < 3; ++i) + { + SLLightSpot* light = new SLLightSpot(am, this, 0.1f); + light->powers(0.2f, + 1.5f, + 1.0f, + SLCol4f(i == 0, i == 1, i == 2)); + light->attenuation(0, 0, 1); + light->translate((float)i - 1.0f, (float)i - 1.0f, (float)i - 1.0f); + light->createsShadows(true); + light->createShadowMap(); + light->shadowMap()->rayCount(SLVec2i(16, 16)); + scene->addChild(light); + anim->createNodeAnimTrackForEllipse(light, + 0.2f, + A_x, + 0.2f, + A_z); + } + + // Create wall polygons + SLfloat pL = -1.48f, pR = 1.48f; // left/right + SLfloat pB = -1.25f, pT = 1.19f; // bottom/top + SLfloat pN = 1.79f, pF = -1.55f; // near/far + + // Bottom plane + SLNode* b = new SLNode(new SLRectangle(am, + SLVec2f(pL, -pN), + SLVec2f(pR, -pF), + 6, + 6, + "bottom", + matPerPixSM)); + b->rotate(90, -1, 0, 0); + b->translate(0, 0, pB, TS_object); + scene->addChild(b); + + // Top plane + SLNode* t = new SLNode(new SLRectangle(am, + SLVec2f(pL, pF), + SLVec2f(pR, pN), + 6, + 6, + "top", + matPerPixSM)); + t->rotate(90, 1, 0, 0); + t->translate(0, 0, -pT, TS_object); + scene->addChild(t); + + // Far plane + SLNode* f = new SLNode(new SLRectangle(am, + SLVec2f(pL, pB), + SLVec2f(pR, pT), + 6, + 6, + "far", + matPerPixSM)); + f->translate(0, 0, pF, TS_object); + scene->addChild(f); + +
// Near plane + SLNode* n = new SLNode(new SLRectangle(am, + SLVec2f(pL, pT), + SLVec2f(pR, pB), + 6, + 6, + "near", + matPerPixSM)); + n->translate(0, 0, pN, TS_object); + scene->addChild(n); + + // Left plane + SLNode* l = new SLNode(new SLRectangle(am, + SLVec2f(-pN, pB), + SLVec2f(-pF, pT), + 6, + 6, + "left", + matPerPixSM)); + l->rotate(90, 0, 1, 0); + l->translate(0, 0, pL, TS_object); + scene->addChild(l); + + // Right plane + SLNode* r = new SLNode(new SLRectangle(am, + SLVec2f(pF, pB), + SLVec2f(pN, pT), + 6, + 6, + "right", + matPerPixSM)); + r->rotate(90, 0, -1, 0); + r->translate(0, 0, -pR, TS_object); + scene->addChild(r); + + // Create 64 randomly scaled and placed cubes which cast shadows + for (SLint i = 0; i < 64; ++i) + { + SLBox* box = new SLBox(am); + box->mat(matPerPixSM); + SLNode* boxNode = new SLNode(box); + + boxNode->scale(Utils::random(0.01f, 0.1f)); + boxNode->translate(Utils::random(pL + 0.3f, pR - 0.3f), + Utils::random(pB + 0.3f, pT - 0.3f), + Utils::random(pF + 0.3f, pN - 0.3f), + TS_world); + boxNode->castsShadows(true); + + scene->addChild(boxNode); + } + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowLightPoint.h b/apps/app_demo/source/scenes/AppDemoSceneShadowLightPoint.h new file mode 100644 index 00000000..cd0c4704 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowLightPoint.h @@ -0,0 +1,46 @@ +/** + * \file AppDemoSceneShadowLightPoint.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code.
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADOWLIGHTPOINT_H +#define APPDEMOSCENESHADOWLIGHTPOINT_H + +#include + +//----------------------------------------------------------------------------- +//! Class for demo scene for point light shadow mapping +class AppDemoSceneShadowLightPoint : public SLScene +{ +public: + AppDemoSceneShadowLightPoint(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowLightSpot.cpp b/apps/app_demo/source/scenes/AppDemoSceneShadowLightSpot.cpp new file mode 100644 index 00000000..3bd802e2 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowLightSpot.cpp @@ -0,0 +1,109 @@ +/** + * \file AppDemoSceneShadowLightSpot.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShadowLightSpot::AppDemoSceneShadowLightSpot() + : SLScene("Shadow Mapping for 8 Spot lights") +{ + info("8 Spot lights use a perspective projection for their light space."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShadowLightSpot::registerAssetsToLoad(SLAssetLoader& al) +{ +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneShadowLightSpot::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Setup shadow mapping material + SLMaterial* matPerPixSM = new SLMaterial(am, "m1"); + + // Base root group node for the scene + SLNode* scene = new SLNode; + this->root3D(scene); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 5, 13); + cam1->lookAt(0, 1, 0); + cam1->focalDist(8); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + scene->addChild(cam1); + + // Create light sources + for (int i = 0; i < SL_MAX_LIGHTS; ++i) + { + SLLightSpot* light = new SLLightSpot(am, + this, + 0.3f, + 45.0f); + SLCol4f color; + color.hsva2rgba(SLVec4f(Utils::TWOPI * (float)i / (float)SL_MAX_LIGHTS, + 1.0f, + 1.0f)); + light->powers(0.0f, 5.0f, 5.0f, color); + light->translation(2 * sin((Utils::TWOPI / (float)SL_MAX_LIGHTS) * (float)i), + 5, + 2 * cos((Utils::TWOPI / (float)SL_MAX_LIGHTS) * (float)i)); + light->lookAt(0, 0, 0); + light->attenuation(0, 0, 1); + light->createsShadows(true); + light->createShadowMap(); + light->shadowMap()->rayCount(SLVec2i(16, 16)); + scene->addChild(light); + } + + // Add a sphere which casts shadows + SLNode* sphereNode = new SLNode(new SLSpheric(am, + 1, + 0, + 180, + 20, + 20, + "Sphere", + matPerPixSM)); + sphereNode->translate(0, 2.0, 0); + sphereNode->castsShadows(true); + scene->addChild(sphereNode); + + SLAnimation* anim = this->animManager().createNodeAnimation("sphere_anim", + 2.0f); + anim->createNodeAnimTrackForEllipse(sphereNode, + 1.0f, + A_x, + 1.0f, + A_z); + + // Add a box which receives shadows + SLNode* boxNode = new SLNode(new SLBox(am, + -5, + -1, + -5, + 5, + 0, + 5, + "Box", + matPerPixSM)); + boxNode->castsShadows(false); + scene->addChild(boxNode); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowLightSpot.h b/apps/app_demo/source/scenes/AppDemoSceneShadowLightSpot.h 
new file mode 100644 index 00000000..253226d9 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowLightSpot.h @@ -0,0 +1,46 @@ +/** + * \file AppDemoSceneShadowLightSpot.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADOWLIGHTSPOT_H +#define APPDEMOSCENESHADOWLIGHTSPOT_H + +#include + +//----------------------------------------------------------------------------- +//! Class for demo scene for spotlight shadow mapping +class AppDemoSceneShadowLightSpot : public SLScene +{ +public: + AppDemoSceneShadowLightSpot(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowLightTypes.cpp b/apps/app_demo/source/scenes/AppDemoSceneShadowLightTypes.cpp new file mode 100644 index 00000000..23c7241d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowLightTypes.cpp @@ -0,0 +1,133 @@ +/** + * \file AppDemoSceneShadowLightTypes.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneShadowLightTypes::AppDemoSceneShadowLightTypes() + : SLScene("Shadow Mapping Types Demo Scene") +{ + info("Shadow Mapping is implemented for these light types."); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneShadowLightTypes::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_teapot, + AppCommon::modelPath + + "FBX/Teapot/Teapot.fbx"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneShadowLightTypes::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLMaterial* mat1 = new SLMaterial(am, "mat1"); + + // Base root group node for the scene + SLNode* scene = new SLNode; + this->root3D(scene); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 2, 20); + cam1->lookAt(0, 2, 0); + cam1->focalDist(20); + cam1->background().colors(SLCol4f(0.1f, 0.1f, 0.1f)); + cam1->setInitialState(); + scene->addChild(cam1); + + // Create light sources: one directional, one rectangular and two spot lights + vector<SLLight*> lights = { + new SLLightDirect(am, this), + new SLLightRect(am, this), + new SLLightSpot(am, this, 0.3f, 25.0f), + new SLLightSpot(am, this, 0.1f, 180.0f)}; + + for (SLint i = 0; i < (SLint)lights.size(); ++i) + { + SLLight* light = lights[i]; + SLNode* node = dynamic_cast<SLNode*>(light); + SLfloat x = ((float)i - ((SLfloat)lights.size() - 1.0f) / 2.0f) * 5; + + if (i == 0) // Make direct light less bright + { + light->powers(0.0f, 0.4f, 0.4f); + light->attenuation(1, 0, 0); + } + else + { + light->powers(0.0f, 2.0f, 2.0f); + light->attenuation(0, 0, 1); + } + + node->translation(x, 5, 0); + node->lookAt(x, 0, 0); + light->createsShadows(true); + light->createShadowMap(); + light->shadowMap()->rayCount(SLVec2i(16, 16)); + scene->addChild(node); + } + + SLAnimation* teapotAnim = this->animManager().createNodeAnimation("teapot_anim", + 8.0f, + true, + EC_linear, + AL_loop); + + for (SLLight* light : lights) + { + SLNode* teapot = _teapot->copyRec(); + + teapot->translate(light->positionWS().x, 2, 0); + teapot->children()[0]->castsShadows(true); + scene->addChild(teapot); + + // Create animation: three keyframes at 0s, 4s and 8s on the teapot copy + SLNodeAnimTrack* track = teapotAnim->createNodeAnimTrack(); + track->animatedNode(teapot); + + SLTransformKeyframe* frame0 = track->createNodeKeyframe(0.0f); + frame0->translation(teapot->translationWS()); + frame0->rotation(SLQuat4f(0, 0, 0)); + + SLTransformKeyframe* frame1 = track->createNodeKeyframe(4.0f); + frame1->translation(teapot->translationWS()); + frame1->rotation(SLQuat4f(0, 1 * PI,
0)); + + SLTransformKeyframe* frame2 = track->createNodeKeyframe(8.0f); + frame2->translation(teapot->translationWS()); + frame2->rotation(SLQuat4f(0, 2 * PI, 0)); + } + + // Add a box which receives shadows and spans 3 units beyond the first and last light in x + SLfloat minx = lights.front()->positionWS().x - 3; + SLfloat maxx = lights.back()->positionWS().x + 3; + SLNode* boxNode = new SLNode(new SLBox(am, + minx, + -1, + -5, + maxx, + 0, + 5, + "Box", + mat1)); + boxNode->castsShadows(false); + scene->addChild(boxNode); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneShadowLightTypes.h b/apps/app_demo/source/scenes/AppDemoSceneShadowLightTypes.h new file mode 100644 index 00000000..c249b28b --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneShadowLightTypes.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneShadowLightTypes.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESHADOWLIGHTTYPES_H +#define APPDEMOSCENESHADOWLIGHTTYPES_H + +#include + +//----------------------------------------------------------------------------- +//! Class for demo scene with all shadow mapping types +class AppDemoSceneShadowLightTypes : public SLScene +{ +public: + AppDemoSceneShadowLightTypes(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble.
Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _teapot; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneSuzanne.cpp b/apps/app_demo/source/scenes/AppDemoSceneSuzanne.cpp new file mode 100644 index 00000000..bc7b45fd --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneSuzanne.cpp @@ -0,0 +1,150 @@ +/** + * \file AppDemoSceneSuzanne.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneSuzanne::AppDemoSceneSuzanne(SLstring name, + bool textureMapping, + bool normalMapping, + bool occlusionMapping, + bool shadowMapping, + bool environmentMapping) + : SLScene(name), + _textureMapping(textureMapping), + _normalMapping(normalMapping), + _occlusionMapping(occlusionMapping), + _shadowMapping(shadowMapping), + _environmentMapping(environmentMapping) +{ + info(name); + _skybox = nullptr; +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneSuzanne::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_suzanneInCube, + AppCommon::modelPath + + "GLTF/AO-Baked-Test/AO-Baked-Test.gltf", + nullptr, + false, // delete tex images after build + true, // load meshes only + nullptr, // override material + 0.5f); + + if (_environmentMapping) + al.addSkyboxToLoad(_skybox, + al.modelPath() + + "GLTF/glTF-Sample-Models/hdris/envmap_malibu.hdr", + SLVec2i(256, 256), + "HDR Skybox"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneSuzanne::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create camera in the center + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0.5f, 2); + cam1->lookAt(0, 0.5f, 0); + cam1->setInitialState(); + cam1->focalDist(2); + scene->addChild(cam1); + + // Create directional light for the sunlight + SLLightDirect* light; + + if (_environmentMapping) + { + SLLight::gamma = 2.0f; + light = new SLLightDirect(am, this, 0.1f); + light->ambientPower(0.0f); + light->diffusePower(3.0f); + light->attenuation(1, 0, 0); + light->translate(0, 0, 0.5); + light->lookAt(-1, -1, -1); + } + else + { + SLLight::gamma = 1.0f; + light = new SLLightDirect(am, this, 0.1f); + light->ambientPower(0.6f); + light->diffusePower(0.6f); + light->attenuation(1, 0, 0); + light->translate(0, 0, 0.5); + light->lookAt(1, -1, 0.5); + SLAnimation* lightAnim = animManager().createNodeAnimation("LightAnim", + 4.0f, + true, + EC_inOutSine, + AL_pingPongLoop); + lightAnim->createNodeAnimTrackForRotation(light, + -180, + SLVec3f(0, 1, 0)); + } + scene->addChild(light); + + // Add shadow mapping + if (_shadowMapping) + { + light->createsShadows(true); + light->createShadowMap(-3, + 3, + SLVec2f(5, 5), + SLVec2i(2048, 2048)); + light->doSmoothShadows(true); + } + + SLCol4f stoneColor(0.56f, 0.50f, 0.44f); + + // Remove unwanted textures (explicit captures: implicit this-capture via [=] is deprecated in C++20) + auto materialUpdater = [this, stoneColor](SLMaterial* mat) + { + if (!_textureMapping) + { + mat->removeTextureType(TT_diffuse); + mat->ambientDiffuse(stoneColor); + } + + if (!_normalMapping) + mat->removeTextureType(TT_normal); + + if (!_occlusionMapping) + mat->removeTextureType(TT_occlusion); + + if (_environmentMapping) + { + mat->skybox(_skybox); + mat->reflectionModel(RM_CookTorrance); + this->skybox(_skybox); + } + }; + _suzanneInCube->updateMeshMat(materialUpdater, + true); + + scene->addChild(_suzanneInCube); + + sv->camera(cam1); + + // Save
energy + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneSuzanne.h b/apps/app_demo/source/scenes/AppDemoSceneSuzanne.h new file mode 100644 index 00000000..363121f2 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneSuzanne.h @@ -0,0 +1,61 @@ +/** + * \file AppDemoSceneSuzanne.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENESUZANNE_H +#define APPDEMOSCENESUZANNE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for all variants of generated shaders on the Suzanne head +class AppDemoSceneSuzanne : public SLScene +{ +public: + AppDemoSceneSuzanne(SLstring name, + bool textureMapping, + bool normalMapping, + bool occlusionMapping, + bool shadowMapping, + bool environmentMapping); + + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble.
Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLNode* _suzanneInCube; + SLSkybox* _skybox; + bool _textureMapping; + bool _normalMapping; + bool _occlusionMapping; + bool _shadowMapping; + bool _environmentMapping; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneTextureBlend.cpp b/apps/app_demo/source/scenes/AppDemoSceneTextureBlend.cpp new file mode 100644 index 00000000..3ffc3df0 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneTextureBlend.cpp @@ -0,0 +1,166 @@ +/** + * \file AppDemoSceneTextureBlend.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marcus Hudritsch + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneTextureBlend::AppDemoSceneTextureBlend() : SLScene("Texture Blending Test") +{ + info("Texture map blending with depth sorting. Transparent tree rectangles in view " + "frustum are rendered back to front. 
You can turn on/off alpha sorting in the " + "menu Preferences or press key J."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneTextureBlend::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_t1, + AppCommon::texturePath + + "tree1_1024_C.png", + GL_LINEAR_MIPMAP_LINEAR, + GL_LINEAR, + TT_diffuse, + GL_CLAMP_TO_EDGE, + GL_CLAMP_TO_EDGE); + al.addTextureToLoad(_t2, + AppCommon::texturePath + + "grass0512_C.jpg", + GL_LINEAR_MIPMAP_LINEAR, + GL_LINEAR); + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "PerVrtTm.vert", + AppCommon::shaderPath + "PerVrtTm.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneTextureBlend::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLMaterial* m1 = new SLMaterial(am, + "m1", + SLCol4f(1, 1, 1), + SLCol4f(0, 0, 0), + 100); + SLMaterial* m2 = new SLMaterial(am, + "m2", + SLCol4f(1, 1, 1), + SLCol4f(0, 0, 0), + 100); + m1->program(_sp); + m1->addTexture(_t1); + m2->addTexture(_t2); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(6.5f, 0.5f, -18); + cam1->lookAt(0, 0, 0); + cam1->focalDist(18); + cam1->background().colors(SLCol4f(0.6f, 0.6f, 1)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLLightSpot* light = new SLLightSpot(am, this, 0.1f); + light->translation(5, 5, 5); + light->lookAt(0, 0, 0); + light->attenuation(1, 0, 0); + + // Build arrays for polygon vertices and texture coordinates for tree + SLVVec3f pNW, pSE; + SLVVec2f tNW, tSE; + pNW.push_back(SLVec3f(0, 0, 0)); + tNW.push_back(SLVec2f(0.5f, 0.0f)); + pNW.push_back(SLVec3f(1, 0, 0)); + tNW.push_back(SLVec2f(1.0f, 0.0f)); + pNW.push_back(SLVec3f(1, 2, 0)); + tNW.push_back(SLVec2f(1.0f, 1.0f)); + pNW.push_back(SLVec3f(0, 2,
0)); + tNW.push_back(SLVec2f(0.5f, 1.0f)); + pSE.push_back(SLVec3f(-1, 0, 0)); + tSE.push_back(SLVec2f(0.0f, 0.0f)); + pSE.push_back(SLVec3f(0, 0, 0)); + tSE.push_back(SLVec2f(0.5f, 0.0f)); + pSE.push_back(SLVec3f(0, 2, 0)); + tSE.push_back(SLVec2f(0.5f, 1.0f)); + pSE.push_back(SLVec3f(-1, 2, 0)); + tSE.push_back(SLVec2f(0.0f, 1.0f)); + + // Build tree out of 4 polygons + SLNode* p1 = new SLNode(new SLPolygon(am, pNW, tNW, "Tree+X", m1)); + SLNode* p2 = new SLNode(new SLPolygon(am, pNW, tNW, "Tree-Z", m1)); + p2->rotate(90, 0, 1, 0); + SLNode* p3 = new SLNode(new SLPolygon(am, pSE, tSE, "Tree-X", m1)); + SLNode* p4 = new SLNode(new SLPolygon(am, pSE, tSE, "Tree+Z", m1)); + p4->rotate(90, 0, 1, 0); + + // Turn face culling off so that we see both sides + p1->drawBits()->on(SL_DB_CULLOFF); + p2->drawBits()->on(SL_DB_CULLOFF); + p3->drawBits()->on(SL_DB_CULLOFF); + p4->drawBits()->on(SL_DB_CULLOFF); + + // Build tree group + SLNode* tree = new SLNode("grTree"); + tree->addChild(p1); + tree->addChild(p2); + tree->addChild(p3); + tree->addChild(p4); + + // Build arrays for polygon vertices and texcoords for ground + SLVVec3f pG; + SLVVec2f tG; + SLfloat size = 22.0f; + pG.push_back(SLVec3f(-size, 0, size)); + tG.push_back(SLVec2f(0, 0)); + pG.push_back(SLVec3f(size, 0, size)); + tG.push_back(SLVec2f(30, 0)); + pG.push_back(SLVec3f(size, 0, -size)); + tG.push_back(SLVec2f(30, 30)); + pG.push_back(SLVec3f(-size, 0, -size)); + tG.push_back(SLVec2f(0, 30)); + + SLNode* scene = new SLNode("grScene"); + this->root3D(scene); + scene->addChild(light); + scene->addChild(tree); + scene->addChild(new SLNode(new SLPolygon(am, + pG, + tG, + "Ground", + m2))); + + // create 21*21-1 references around the center tree + SLint res = 10; + for (SLint iZ = -res; iZ <= res; ++iZ) + { + for (SLint iX = -res; iX <= res; ++iX) + { + if (iX != 0 || iZ != 0) + { + SLNode* t = tree->copyRec(); + t->translate(float(iX) * 2 + Utils::random(0.7f, 1.4f), + 0, + float(iZ) * 2 + Utils::random(0.7f,
1.4f), + TS_object); + t->rotate(Utils::random(0.f, 90.f), 0, 1, 0); + t->scale(Utils::random(0.5f, 1.0f)); + scene->addChild(t); + } + } + } + + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneTextureBlend.h b/apps/app_demo/source/scenes/AppDemoSceneTextureBlend.h new file mode 100644 index 00000000..32a309a2 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneTextureBlend.h @@ -0,0 +1,51 @@ +/** + * \file AppDemoSceneTextureBlend.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENETEXTUREBLEND_H +#define APPDEMOSCENETEXTUREBLEND_H + +#include + +//----------------------------------------------------------------------------- +//! Class for texture blending scene +class AppDemoSceneTextureBlend : public SLScene +{ +public: + AppDemoSceneTextureBlend(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble.
Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _t1; + SLGLTexture* _t2; + SLGLProgram* _sp; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneTextureCompression.cpp b/apps/app_demo/source/scenes/AppDemoSceneTextureCompression.cpp new file mode 100644 index 00000000..54e3e147 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneTextureCompression.cpp @@ -0,0 +1,276 @@ +/** + * \file AppDemoSceneTextureCompression.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneTextureCompression::AppDemoSceneTextureCompression() : SLScene("Texture Compression Test Scene") +{ + info("Texture Compression Test Scene"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneTextureCompression::registerAssetsToLoad(SLAssetLoader& al) +{ + SLint min = GL_LINEAR_MIPMAP_LINEAR; + SLint mag = GL_LINEAR; + + al.addTextureToLoad(_texPng, + AppCommon::texturePath + + "earth2048_C.png", + min, + mag); + al.addTextureToLoad(_texJpgQ90, + AppCommon::texturePath + + "earth2048_C_Q90.jpg", + min, + mag); + al.addTextureToLoad(_texJpgQ40, + AppCommon::texturePath + + "earth2048_C_Q40.jpg", + min, + mag); + + /* Console commands to generate the following KTX files + ./../../../externals/prebuilt/mac64_ktx_v4.0.0-beta7-cpvr/release/toktx --automipmap --linear --lower_left_maps_to_s0t0 --bcmp --clevel 4 --qlevel 255 earth2048_C_bcmp_Q255.ktx2 earth2048_C.png + ./../../../externals/prebuilt/mac64_ktx_v4.0.0-beta7-cpvr/release/toktx --automipmap --linear --lower_left_maps_to_s0t0 --bcmp --clevel 4 --qlevel 128 earth2048_C_bcmp_Q128.ktx2 earth2048_C.png + ./../../../externals/prebuilt/mac64_ktx_v4.0.0-beta7-cpvr/release/toktx --automipmap --linear --lower_left_maps_to_s0t0 --bcmp --clevel 4 --qlevel 1 earth2048_C_bcmp_Q001.ktx2 earth2048_C.png + ./../../../externals/prebuilt/mac64_ktx_v4.0.0-beta7-cpvr/release/toktx --automipmap --linear --lower_left_maps_to_s0t0 --uastc 4 --zcmp 19 earth2048_C_uastc4.ktx2 earth2048_C.png + ./../../../externals/prebuilt/mac64_ktx_v4.0.0-beta7-cpvr/release/toktx --automipmap --linear --lower_left_maps_to_s0t0 --uastc 2 --zcmp 19 earth2048_C_uastc2.ktx2 earth2048_C.png + ./../../../externals/prebuilt/mac64_ktx_v4.0.0-beta7-cpvr/release/toktx --automipmap --linear --lower_left_maps_to_s0t0 --uastc 0 --zcmp 19 earth2048_C_uastc0.ktx2 earth2048_C.png + */ + + al.addTextureToLoad(_texKtxBcmp255, + AppCommon::texturePath + + "earth2048_C_bcmp_Q255.ktx2", + min, + mag); + al.addTextureToLoad(_texKtxBcmp128, + AppCommon::texturePath + + "earth2048_C_bcmp_Q128.ktx2", + min, + mag); + al.addTextureToLoad(_texKtxBcmp001, + AppCommon::texturePath + + "earth2048_C_bcmp_Q001.ktx2", + min, + mag); + 
al.addTextureToLoad(_texKtxUastc4, + AppCommon::texturePath + + "earth2048_C_uastc4.ktx2", + min, + mag); + al.addTextureToLoad(_texKtxUastc2, + AppCommon::texturePath + + "earth2048_C_uastc2.ktx2", + min, + mag); + al.addTextureToLoad(_texKtxUastc0, + AppCommon::texturePath + + "earth2048_C_uastc0.ktx2", + min, + mag); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneTextureCompression::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + this->root3D(scene); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.1f); + light1->translation(5, 5, 5); + light1->name("light node"); + scene->addChild(light1); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.1f); + cam1->clipFar(100); + cam1->translation(0, 0, 4.2f); + cam1->lookAt(0, 0, 0); + cam1->focalDist(4.2f); + cam1->background().colors(SLCol4f(0.7f, 0.7f, 0.7f), + SLCol4f(0.2f, 0.2f, 0.2f)); + cam1->setInitialState(); + scene->addChild(cam1); + + // Position for rectangle and uv out of earth texture + SLVec2f pMin(-.5f, -.5f), pMax(.5f, .5f); + SLVec2f tMin(.47f, .69f), tMax(.56f, .81f); + + //......................................................................... + SLMaterial* matPng = new SLMaterial(am, + "matPng", + _texPng); + SLMesh* rectMeshPng = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshPng", + matPng); + SLNode* rectNodePng = new SLNode(rectMeshPng, + "rectNodePng"); + rectNodePng->translate(-1.05f, 1.05f, 0); + scene->addChild(rectNodePng); + //......................................................................... 
+ SLMaterial* matJpgQ90 = new SLMaterial(am, + "matJpgQ90", + _texJpgQ90); + SLMesh* rectMeshJpgQ90 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshJpgQ90", + matJpgQ90); + SLNode* rectNodeJpgQ90 = new SLNode(rectMeshJpgQ90, + "rectNodeJpgQ90"); + rectNodeJpgQ90->translate(0, 1.05f, 0); + scene->addChild(rectNodeJpgQ90); + //......................................................................... + SLMaterial* matJpgQ40 = new SLMaterial(am, + "matJpgQ40", + _texJpgQ40); + SLMesh* rectMeshJpgQ40 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshJpgQ40", + matJpgQ40); + SLNode* rectNodeJpgQ40 = new SLNode(rectMeshJpgQ40, + "rectNodeJpgQ40"); + rectNodeJpgQ40->translate(1.05f, 1.05f, 0); + scene->addChild(rectNodeJpgQ40); + //......................................................................... + SLMaterial* matKtxBcmp255 = new SLMaterial(am, + "matKtxBcmp255", + _texKtxBcmp255); + SLMesh* rectMeshKtxBcmp255 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshKtxBcmp255", + matKtxBcmp255); + SLNode* rectNodeKtxBcmp255 = new SLNode(rectMeshKtxBcmp255, + "rectNodeKtxBcmp255"); + rectNodeKtxBcmp255->translate(-1.05f, 0, 0); + scene->addChild(rectNodeKtxBcmp255); + //......................................................................... + SLMaterial* matKtxBcmp128 = new SLMaterial(am, + "matKtxBcmp128", + _texKtxBcmp128); + SLMesh* rectMeshKtxBcmp128 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshKtxBcmp128", + matKtxBcmp128); + SLNode* rectNodeKtxBcmp128 = new SLNode(rectMeshKtxBcmp128, + "rectNodeKtxBcmp128"); + rectNodeKtxBcmp128->translate(0, 0, 0); + scene->addChild(rectNodeKtxBcmp128); + //......................................................................... 
+ SLMaterial* matKtxBcmp001 = new SLMaterial(am, + "matKtxBcmp001", + _texKtxBcmp001); + SLMesh* rectMeshKtxBcmp001 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshKtxBcmp001", + matKtxBcmp001); + SLNode* rectNodeKtxBcmp001 = new SLNode(rectMeshKtxBcmp001, + "rectNodeKtxBcmp001"); + rectNodeKtxBcmp001->translate(1.05f, 0, 0); + scene->addChild(rectNodeKtxBcmp001); + //......................................................................... + SLMaterial* matKtxUastc4 = new SLMaterial(am, + "matKtxUastc4", + _texKtxUastc4); + SLMesh* rectMeshKtxUastc4 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshKtxUastc4", + matKtxUastc4); + SLNode* rectNodeKtxUastc4 = new SLNode(rectMeshKtxUastc4, + "rectNodeKtxUastc4"); + rectNodeKtxUastc4->translate(1.05f, -1.05f, 0); + scene->addChild(rectNodeKtxUastc4); + //......................................................................... + SLMaterial* matKtxUastc2 = new SLMaterial(am, + "matKtxUastc2", + _texKtxUastc2); + SLMesh* rectMeshKtxUastc2 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshKtxUastc2", + matKtxUastc2); + SLNode* rectNodeKtxUastc2 = new SLNode(rectMeshKtxUastc2, + "rectNodeKtxUastc2"); + rectNodeKtxUastc2->translate(0, -1.05f, 0); + scene->addChild(rectNodeKtxUastc2); + //......................................................................... + SLMaterial* matKtxUastc0 = new SLMaterial(am, + "matKtxUastc0", + _texKtxUastc0); + SLMesh* rectMeshKtxUastc0 = new SLRectangle(am, + pMin, + pMax, + tMin, + tMax, + 1, + 1, + "rectMeshKtxUastc0", + matKtxUastc0); + SLNode* rectNodeKtxUastc0 = new SLNode(rectMeshKtxUastc0, + "rectNodeKtxUastc0"); + rectNodeKtxUastc0->translate(-1.05f, -1.05f, 0); + scene->addChild(rectNodeKtxUastc0); + //......................................................................... 
+ // Add active camera + sv->camera(cam1); + + // Save energy + sv->doWaitOnIdle(true); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneTextureCompression.h b/apps/app_demo/source/scenes/AppDemoSceneTextureCompression.h new file mode 100644 index 00000000..ac7a87f6 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneTextureCompression.h @@ -0,0 +1,57 @@ +/** + * \file AppDemoSceneTextureCompression.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENETEXTURECOMPRESSION_H +#define APPDEMOSCENETEXTURECOMPRESSION_H + +#include + +//----------------------------------------------------------------------------- +//! Class for texture compression test scene +class AppDemoSceneTextureCompression : public SLScene +{ +public: + AppDemoSceneTextureCompression(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble.
Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texPng; + SLGLTexture* _texJpgQ90; + SLGLTexture* _texJpgQ40; + SLGLTexture* _texKtxBcmp255; + SLGLTexture* _texKtxBcmp128; + SLGLTexture* _texKtxBcmp001; + SLGLTexture* _texKtxUastc4; + SLGLTexture* _texKtxUastc2; + SLGLTexture* _texKtxUastc0; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneTextureFilter.cpp b/apps/app_demo/source/scenes/AppDemoSceneTextureFilter.cpp new file mode 100644 index 00000000..170c3949 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneTextureFilter.cpp @@ -0,0 +1,174 @@ +/** + * \file AppDemoSceneTextureFilter.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneTextureFilter::AppDemoSceneTextureFilter() : SLScene("Texture Filter Test") +{ + info("Texture minification filters: " + "Bottom: linear, left: nearest, top: linear mipmap, right: anisotropic. " + "The center sphere uses a 3D texture with linear filtering."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneTextureFilter::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addTextureToLoad(_texB, + AppCommon::texturePath + + "brick0512_C.png", + GL_LINEAR, + GL_LINEAR); + al.addTextureToLoad(_texL, + AppCommon::texturePath + + "brick0512_C.png", + GL_NEAREST, + GL_NEAREST); + al.addTextureToLoad(_texT, + AppCommon::texturePath + + "brick0512_C.png", + GL_LINEAR_MIPMAP_LINEAR, + GL_LINEAR); + al.addTextureToLoad(_texR, + AppCommon::texturePath + + "brick0512_C.png", + SL_ANISOTROPY_MAX, + GL_LINEAR); + al.addTextureToLoad(_tex3D, + 256, + AppCommon::texturePath + + "Wave_radial10_256C.jpg"); + al.addProgramToLoad(_spr3D, + AppCommon::shaderPath + "TextureOnly3D.vert", + AppCommon::shaderPath + "TextureOnly3D.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneTextureFilter::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // define materials with textureOnly shader, no light needed + SLMaterial* matB = new SLMaterial(am, + "matB", + _texB, + nullptr, + nullptr, + nullptr, + SLGLProgramManager::get(SP_textureOnly)); + SLMaterial* matL = new SLMaterial(am, + "matL", + _texL, + nullptr, + nullptr, + nullptr, + SLGLProgramManager::get(SP_textureOnly)); + SLMaterial* matT = new SLMaterial(am, + "matT", + _texT, + nullptr, + nullptr, + nullptr, + SLGLProgramManager::get(SP_textureOnly)); + SLMaterial* matR = new SLMaterial(am, + "matR", + _texR, + nullptr, + nullptr, + nullptr, + SLGLProgramManager::get(SP_textureOnly)); + + // build polygons for bottom, left, top & right side + SLVVec3f VB; + VB.push_back(SLVec3f(-0.5f, -0.5f, 1.0f)); + VB.push_back(SLVec3f(0.5f, -0.5f, 1.0f)); + VB.push_back(SLVec3f(0.5f, -0.5f, -2.0f)); + VB.push_back(SLVec3f(-0.5f, -0.5f, -2.0f)); + SLVVec2f T; + T.push_back(SLVec2f(0.0f, 2.0f)); + T.push_back(SLVec2f(0.0f, 0.0f)); + T.push_back(SLVec2f(6.0f, 0.0f)); + T.push_back(SLVec2f(6.0f, 2.0f)); + SLNode* polyB = new SLNode(new SLPolygon(am, VB, T, "PolygonB", matB)); + + SLVVec3f VL; + VL.push_back(SLVec3f(-0.5f, 0.5f, 1.0f)); + VL.push_back(SLVec3f(-0.5f, -0.5f, 1.0f)); + VL.push_back(SLVec3f(-0.5f, -0.5f, -2.0f)); + VL.push_back(SLVec3f(-0.5f, 0.5f, -2.0f)); + SLNode* polyL = new SLNode(new SLPolygon(am, VL, T, "PolygonL", matL)); + + SLVVec3f VT; + VT.push_back(SLVec3f(0.5f, 0.5f, 1.0f)); + VT.push_back(SLVec3f(-0.5f, 0.5f, 1.0f)); + VT.push_back(SLVec3f(-0.5f, 0.5f, -2.0f)); + VT.push_back(SLVec3f(0.5f, 0.5f, -2.0f)); + SLNode* polyT = new SLNode(new SLPolygon(am, VT, T, "PolygonT", matT)); + + SLVVec3f VR; + VR.push_back(SLVec3f(0.5f, -0.5f, 1.0f)); + VR.push_back(SLVec3f(0.5f, 0.5f, 1.0f)); + VR.push_back(SLVec3f(0.5f, 0.5f, -2.0f)); + VR.push_back(SLVec3f(0.5f, -0.5f, -2.0f)); + SLNode* polyR = new SLNode(new SLPolygon(am, VR, T, "PolygonR", matR)); + + // 3D Texture 
Mapping on a pyramid + SLMaterial* mat3D = new SLMaterial(am, + "mat3D", + _tex3D, + nullptr, + nullptr, + nullptr, + _spr3D); + + // Create 3D textured pyramid mesh and node + SLMesh* pyramid = new SLMesh(am, "Pyramid"); + pyramid->mat(mat3D); + pyramid->P = {{-1, -1, 1}, + {1, -1, 1}, + {1, -1, -1}, + {-1, -1, -1}, + {0, 2, 0}}; + pyramid->I16 = {0, 3, 1, 1, 3, 2, 4, 0, 1, 4, 1, 2, 4, 2, 3, 4, 3, 0}; + SLNode* pyramidNode = new SLNode(pyramid, "Pyramid"); + pyramidNode->scale(0.2f); + pyramidNode->translate(0, 0, -3); // NOTE(review): pyramidNode is never added to the scene below - confirm whether this is intended + + // Create 3D textured sphere mesh and node + SLNode* sphere = new SLNode(new SLSphere(am, + 0.2f, + 16, + 16, + "Sphere", + mat3D)); + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 2.6f); + cam1->lookAt(0, 0, 0); + cam1->focalDist(2.2f); + cam1->background().colors(SLCol4f(0.2f, 0.2f, 0.2f)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + SLNode* scene = new SLNode(); + this->root3D(scene); + scene->addChild(polyB); + scene->addChild(polyL); + scene->addChild(polyT); + scene->addChild(polyR); + scene->addChild(sphere); + scene->addChild(cam1); + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneTextureFilter.h b/apps/app_demo/source/scenes/AppDemoSceneTextureFilter.h new file mode 100644 index 00000000..cbd02ee6 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneTextureFilter.h @@ -0,0 +1,54 @@ +/** + * \file AppDemoSceneTextureFilter.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code.
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENETEXTUREFILTER_H +#define APPDEMOSCENETEXTUREFILTER_H + +#include + +//----------------------------------------------------------------------------- +//! Class for texture filtering scene +class AppDemoSceneTextureFilter : public SLScene +{ +public: + AppDemoSceneTextureFilter(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _texB; + SLGLTexture* _texL; + SLGLTexture* _texT; + SLGLTexture* _texR; + SLGLTexture* _tex3D; + SLGLProgram* _spr3D; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoSensorAR.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoSensorAR.cpp new file mode 100644 index 00000000..0880fd21 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoSensorAR.cpp @@ -0,0 +1,113 @@ +/** + * \file AppDemoSceneVideoSensorAR.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoSensorAR::AppDemoSceneVideoSensorAR() + : SLScene("Video Sensor AR") +{ + info("Minimal scene to test the devices IMU and GPS Sensors. " + "See the sensor information. GPS needs a few sec. to improve the accuracy."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneVideoSensorAR::registerAssetsToLoad(SLAssetLoader& al) +{ + // Register LiveVideoError.png for loading into the global video texture that is updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVideoSensorAR::assemble(SLAssetManager* am, SLSceneView* sv) +{ + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 60); + cam1->lookAt(0, 0, 0); + cam1->fov(CVCapture::instance()->activeCamera->calibration.cameraFovVDeg()); + cam1->clipNear(0.1f); + cam1->clipFar(10000.0f); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + // Turn on main video + CVCapture::instance()->videoType(VT_MAIN); + + // Create directional light for the sunlight + SLLightDirect* light = new SLLightDirect(am, this, 1.0f); + light->powers(1.0f, 1.0f, 1.0f); + light->attenuation(1, 0, 0); + + // Let the sun be rotated by time and location + AppCommon::devLoc.sunLightNode(light); + + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->scale(2); + axis->rotate(-90, 1, 0, 0); + + // Yellow center box + SLMaterial* yellow = new SLMaterial(am, + "mY", + SLCol4f(1, 1, 0, 0.5f)); + SLNode* box = new SLNode(new SLBox( + am, + -.5f, + -.5f, + -.5f, + .5f, + .5f, + .5f, + "Box", + yellow), + "Box Node"); + + // Scene structure + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(light); + scene->addChild(cam1); + scene->addChild(box); + scene->addChild(axis); + + sv->camera(cam1); + +#if defined(SL_OS_MACIOS) || defined(SL_OS_ANDROID) + // Activate rotation and GPS sensors (only for SL_OS_MACIOS and SL_OS_ANDROID builds) + AppCommon::devRot.isUsed(true); + AppCommon::devRot.zeroYawAtStart(false); + AppCommon::devLoc.isUsed(true); +
AppCommon::devLoc.useOriginAltitude(true); + AppCommon::devLoc.hasOrigin(false); + cam1->camAnim(SLCamAnim::CA_deviceRotLocYUp); +#else + cam1->camAnim(SLCamAnim::CA_turntableYUp); + AppCommon::devRot.zeroYawAtStart(true); +#endif + + sv->doWaitOnIdle(false); // for constant video feed +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoSensorAR.h b/apps/app_demo/source/scenes/AppDemoSceneVideoSensorAR.h new file mode 100644 index 00000000..7b5731c3 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoSensorAR.h @@ -0,0 +1,48 @@ +/** + * \file AppDemoSceneVideoSensorAR.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOSENSORAR_H +#define APPDEMOSCENEVIDEOSENSORAR_H + +#include + +//----------------------------------------------------------------------------- +//! Class for devices IMU and GPS Sensors test scene. +class AppDemoSceneVideoSensorAR : public SLScene +{ +public: + AppDemoSceneVideoSensorAR(); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*!
@remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTexture.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTexture.cpp new file mode 100644 index 00000000..e94a0ee8 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTexture.cpp @@ -0,0 +1,117 @@ +/** + * \file AppDemoSceneVideoTexture.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code.
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTexture::AppDemoSceneVideoTexture(SLSceneID sid) + : SLScene("Texture from Video"), + _sceneID(sid) +{ + // Set the scene info string depending on the requested video source + if (_sceneID == SID_VideoTextureLive) + info("Minimal texture mapping example with live video source."); + else + info("Minimal texture mapping example with video file source."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneVideoTexture::registerAssetsToLoad(SLAssetLoader& al) +{ + // Register LiveVideoError.png for loading into the global video texture that is updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here.
+void AppDemoSceneVideoTexture::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Select the video source: live main camera or looping video file + if (_sceneID == SID_VideoTextureLive) + { + // on desktop, it will be the main camera + CVCapture::instance()->videoType(VT_MAIN); + } + else + { + CVCapture::instance()->videoType(VT_FILE); + CVCapture::instance()->videoFilename = AppCommon::videoPath + "street3.mp4"; + CVCapture::instance()->videoLoops = true; + } + + sv->viewportSameAsVideo(true); + + // Create a material that uses the global video texture updated in AppDemoVideo + SLMaterial* m1 = new SLMaterial(am, "VideoMat", gVideoTexture); + + // Create a root scene group for all nodes + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create a camera node + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 20); + cam1->focalDist(20); + cam1->lookAt(0, 0, 0); + cam1->background().texture(gVideoTexture); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + scene->addChild(cam1); + + // Create rectangle mesh and node with the viewport's aspect ratio + SLfloat h = 5.0f; + SLfloat w = h * sv->viewportWdivH(); + SLMesh* rectMesh = new SLRectangle(am, + SLVec2f(-w, -h), + SLVec2f(w, h), + 1, + 1, + "rect mesh", + m1); + SLNode* rectNode = new SLNode(rectMesh, "rect node"); + rectNode->translation(0, 0, -5); + scene->addChild(rectNode); + + // Center sphere + SLNode* sphere = new SLNode(new SLSphere(am, + 2, + 32, + 32, + "Sphere", + m1)); + sphere->rotate(-90, 1, 0, 0); + scene->addChild(sphere); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->translation(0, 0, 5); + light1->lookAt(0, 0, 0); + light1->name("light node"); + scene->addChild(light1); + + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTexture.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTexture.h new file mode
100644 index 00000000..48861440 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTexture.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneVideoTexture.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOTEX_H +#define APPDEMOSCENEVIDEOTEX_H + +#include + +//----------------------------------------------------------------------------- +//! Class for live or file video on texture test scene +class AppDemoSceneVideoTexture : public SLScene +{ +public: + AppDemoSceneVideoTexture(SLSceneID sid); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLSceneID _sceneID; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackAruco.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackAruco.cpp new file mode 100644 index 00000000..32b4a4ef --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackAruco.cpp @@ -0,0 +1,145 @@ +/** + * \file AppDemoSceneVideoTrackAruco.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTrackAruco::AppDemoSceneVideoTrackAruco(SLSceneID sid) + : SLScene("Aruco Marker Tracking"), + _sceneID(sid) +{ + if (_sceneID == SID_VideoTrackArucoMain) + { + name("Track Aruco (main cam.)"); + info("Hold the Aruco board dictionary 0 into the field of view of " + "the main camera. You can find the Aruco markers in the file " + "data/Calibrations. If not all markers are tracked you may have " + "the mirror the video horizontally."); + } + else + { + name("Track Aruco (scnd.
cam.)"); + info("Hold the Aruco board dictionary 0 into the field of view of " + "the secondary camera. You can find the Aruco markers in the file " + "data/Calibrations. If not all markers are tracked you may have " + "the mirror the video horizontally."); + } +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneVideoTrackAruco::registerAssetsToLoad(SLAssetLoader& al) +{ + // Register LiveVideoError.png for loading into the global video texture that is updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + // Create an ArUco tracker in a load task; assemble() later casts gVideoTracker back to CVTrackedAruco + al.addLoadTask([]() + { + gVideoTracker = new CVTrackedAruco(9, AppCommon::calibIniPath); + gVideoTracker->drawDetection(true); }); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVideoTrackAruco::assemble(SLAssetManager* am, SLSceneView* sv) +{ + /* + The tracking of markers is done in AppDemoVideo::onUpdateVideo by calling + the specific CVTracked::track method. If a marker was found it overwrites + the linked nodes object matrix (SLNode::_om). If the linked node is the + active camera the found transform is additionally inversed. + This would be the standard augmented reality use case.
+ */ + + if (_sceneID == SID_VideoTrackArucoMain) + CVCapture::instance()->videoType(VT_MAIN); + else + CVCapture::instance()->videoType(VT_SCND); + + // Materials (each with its own name) + SLMaterial* yellow = new SLMaterial(am, + "mY", + SLCol4f(1, 1, 0, 0.5f)); + SLMaterial* cyan = new SLMaterial(am, + "mC", + SLCol4f(0, 1, 1, 0.5f)); + + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create a camera node 1 + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 5); + cam1->lookAt(0, 0, 0); + cam1->fov(CVCapture::instance()->activeCamera->calibration.cameraFovVDeg()); + cam1->background().texture(gVideoTexture); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + scene->addChild(cam1); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.02f); + light1->translation(0.12f, 0.12f, 0.12f); + light1->name("light node"); + scene->addChild(light1); + + // Get the half edge length of the aruco marker (gVideoTracker was created as CVTrackedAruco in registerAssetsToLoad) + SLfloat edgeLen = static_cast<CVTrackedAruco*>(gVideoTracker)->params().edgeLength; + SLfloat he = edgeLen * 0.5f; + + // Build mesh & node that will be tracked by the 1st marker (camera) + SLBox* box1 = new SLBox(am, + -he, + -he, + 0.0f, + he, + he, + 2 * he, + "Box 1", + yellow); + SLNode* boxNode1 = new SLNode(box1, + "Box Node 1"); + SLNode* axisNode1 = new SLNode(new SLCoordAxis(am), + "Axis Node 1"); + axisNode1->setDrawBitsRec(SL_DB_MESHWIRED, false); + axisNode1->scale(edgeLen); + boxNode1->addChild(axisNode1); + boxNode1->setDrawBitsRec(SL_DB_CULLOFF, true); + scene->addChild(boxNode1); + + // The tracker moves the box node + gVideoTrackedNode = boxNode1; + + // Set active camera + sv->camera(cam1); + + // Turn on constant redraw + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackAruco.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackAruco.h new
file mode 100644 index 00000000..78d2708f --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackAruco.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneVideoTrackAruco.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOARUCO_H +#define APPDEMOSCENEVIDEOARUCO_H + +#include + +//----------------------------------------------------------------------------- +//! Class for ARUCO marker tracking test scene +class AppDemoSceneVideoTrackAruco : public SLScene +{ +public: + AppDemoSceneVideoTrackAruco(SLSceneID sid); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLSceneID _sceneID; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackChessboard.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackChessboard.cpp new file mode 100644 index 00000000..936eb995 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackChessboard.cpp @@ -0,0 +1,169 @@ +/** + * \file AppDemoSceneVideoTrackChessboard.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTrackChessboard::AppDemoSceneVideoTrackChessboard(SLSceneID sid) + : SLScene("Chessboard Video"), + _sceneID(sid) +{ + switch (_sceneID) + { + case SID_VideoTrackChessMain: name("Track Chessboard (main cam.)"); break; + case SID_VideoTrackChessScnd: name("Track Chessboard (scnd.cam.)"); break; + case SID_VideoCalibrateMain: name("Calibrate Main Camera"); break; + case SID_VideoCalibrateScnd: name("Calibrate Scnd.
Camera"); break; + default: name("Unknow SceneID"); + } +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneVideoTrackChessboard::registerAssetsToLoad(SLAssetLoader& al) +{ + // Register LiveVideoError.png for loading into the global video texture that is updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + // Create a chessboard tracker + al.addLoadTask([]() + { + gVideoTracker = new CVTrackedChessboard(AppCommon::calibIniPath); + gVideoTracker->drawDetection(true); }); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVideoTrackChessboard::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + /* + The tracking of markers is done in AppDemoVideo::onUpdateTracking by + calling the specific CVTracked::track method. If a marker was found it + overwrites the linked nodes object matrix (SLNode::_om). + If the linked node is the active camera the found transform is additionally + inversed. This would be the standard augmented reality use case. + The chessboard marker used in these scenes is also used for the camera + calibration. The different calibration state changes are also handled in + AppDemoVideo::onUpdateVideo. + */ + // Setup here only the requested scene.
+ if (_sceneID == SID_VideoTrackChessMain || + _sceneID == SID_VideoTrackChessScnd) + { + if (_sceneID == SID_VideoTrackChessMain) + CVCapture::instance()->videoType(VT_MAIN); + else + CVCapture::instance()->videoType(VT_SCND); + } + else if (_sceneID == SID_VideoCalibrateMain) + { + if (AppCommon::calibrationEstimator) + { + delete AppCommon::calibrationEstimator; + AppCommon::calibrationEstimator = nullptr; + } + CVCapture::instance()->videoType(VT_MAIN); + } + else if (_sceneID == SID_VideoCalibrateScnd) + { + if (AppCommon::calibrationEstimator) + { + delete AppCommon::calibrationEstimator; + AppCommon::calibrationEstimator = nullptr; + } + CVCapture::instance()->videoType(VT_SCND); + } + + // Material + SLMaterial* yellow = new SLMaterial(am, + "mY", + SLCol4f(1, 1, 0, 0.5f)); + + // Set the edge length of a chessboard square (e1) and its 3x and 9x multiples + SLfloat e1 = 0.028f; + SLfloat e3 = e1 * 3.0f; + SLfloat e9 = e3 * 3.0f; + + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + + // Create a camera node + SLCamera* cam1 = new SLCamera(); + cam1->name("camera node"); + cam1->translation(0, 0, 5); + cam1->lookAt(0, 0, 0); + cam1->focalDist(5); + cam1->clipFar(10); + cam1->fov(CVCapture::instance()->activeCamera->calibration.cameraFovVDeg()); + cam1->background().texture(gVideoTexture); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + scene->addChild(cam1); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, e1 * 0.5f); + light1->translate(e9, e9, e9); + light1->name("light node"); + scene->addChild(light1); + + // Build mesh & node (only for the tracking scenes, not for the calibration scenes) + if (_sceneID == SID_VideoTrackChessMain || + _sceneID == SID_VideoTrackChessScnd) + { + SLBox* box = new SLBox(am, + 0.0f, + 0.0f, + 0.0f, + e3, + e3, + e3, + "Box", + yellow); + SLNode* boxNode = new SLNode(box, + "Box Node"); + boxNode->setDrawBitsRec(SL_DB_CULLOFF, true); + SLNode* axisNode = new SLNode(new SLCoordAxis(am), + "Axis Node"); +
axisNode->setDrawBitsRec(SL_DB_MESHWIRED, false); + axisNode->scale(e3); + boxNode->addChild(axisNode); + scene->addChild(boxNode); + } + + // The tracker moves the camera node + gVideoTrackedNode = cam1; + + // pass the scene group as root node + root3D(scene); + + // Set active camera + sv->camera(cam1); + sv->doWaitOnIdle(false); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackChessboard.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackChessboard.h new file mode 100644 index 00000000..a8056d9d --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackChessboard.h @@ -0,0 +1,49 @@ +/** + * \file AppDemoSceneVideoTrackChessboard.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOTRACKCHESS_H +#define APPDEMOSCENEVIDEOTRACKCHESS_H + +#include + +//----------------------------------------------------------------------------- +//! Class for video for chessboard test scene +class AppDemoSceneVideoTrackChessboard : public SLScene +{ +public: + AppDemoSceneVideoTrackChessboard(SLSceneID sid); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed.
OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLSceneID _sceneID; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFace.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFace.cpp new file mode 100644 index 00000000..0cb29c9a --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFace.cpp @@ -0,0 +1,120 @@ +/** + * \file AppDemoSceneVideoTrackFace.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code.
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTrackFace::AppDemoSceneVideoTrackFace(SLSceneID sid) + : SLScene("Facial Feature Tracking"), + _sceneID(sid) +{ + if (_sceneID == SID_VideoTrackFaceMain) + name("Track Face (main cam.)"); + else + name("Track Face (scnd. cam.)"); + info("Face and facial landmark detection."); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here. +void AppDemoSceneVideoTrackFace::registerAssetsToLoad(SLAssetLoader& al) +{ + al.addNodeToLoad(_glasses, + AppCommon::modelPath + + "FBX/Sunglasses.fbx"); + + // Register LiveVideoError.png for loading into the global video texture that is updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + // Create a face tracker + al.addLoadTask([]() + { + gVideoTracker = new CVTrackedFaces(AppCommon::calibIniPath + "haarcascade_frontalface_alt2.xml", + AppCommon::calibIniPath + "lbfmodel.yaml", + 3); + gVideoTracker->drawDetection(true); }); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVideoTrackFace::assemble(SLAssetManager* am, SLSceneView* sv) +{ +#ifndef SL_EMSCRIPTEN + /* + The tracking of markers is done in AppDemoVideo::onUpdateVideo by calling + the specific CVTracked::track method. If a marker was found it overwrites + the linked nodes object matrix (SLNode::_om).
If the linked node is the + active camera the found transform is additionally inversed. + This would be the standard augmented reality use case. + */ + if (_sceneID == SID_VideoTrackFaceMain) + CVCapture::instance()->videoType(VT_MAIN); + else + CVCapture::instance()->videoType(VT_SCND); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 0.5f); + cam1->clipNear(0.1f); + cam1->clipFar(1000.0f); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 10, + 10, + 10, + 1); + light1->powers(1.0f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + + // Scale and place the sunglasses node registered for loading in registerAssetsToLoad + _glasses->scale(0.008f); + _glasses->translate(0, 1.5f, 0); + + // Add axis arrows at world center + SLNode* axis = new SLNode(new SLCoordAxis(am), "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->scale(0.03f); + + // Scene structure + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(light1); + scene->addChild(cam1); + scene->addChild(_glasses); + scene->addChild(axis); + + // The tracker moves the camera node + gVideoTrackedNode = cam1; + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +#endif +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFace.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFace.h new file mode 100644 index 00000000..3495b720 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFace.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneVideoTrackFace.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + *
\remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOFACE_H +#define APPDEMOSCENEVIDEOFACE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for facial feature tracking test scene +class AppDemoSceneVideoTrackFace : public SLScene +{ +public: + AppDemoSceneVideoTrackFace(SLSceneID sid); + + //! All scene-specific assets have to be registered for async loading in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread.
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLSceneID _sceneID; + SLNode* _glasses; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFeatures.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFeatures.cpp new file mode 100644 index 00000000..f6835978 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFeatures.cpp @@ -0,0 +1,120 @@ +/** + * \file AppDemoSceneVideoTrackFeatures.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTrackFeatures::AppDemoSceneVideoTrackFeatures() + : SLScene("2D Feature Tracking") +{ + info("Augmented Reality 2D Feature Tracking: You need to print out the " + "stones image target from the file data/calibrations/vuforia_markers.pdf"); +} +//----------------------------------------------------------------------------- +//! All assets that should be loaded in parallel must be registered in here.
+void AppDemoSceneVideoTrackFeatures::registerAssetsToLoad(SLAssetLoader& al) +{ + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + // Create feature tracker + al.addLoadTask([]() { + gVideoTracker = new CVTrackedFeatures(AppCommon::texturePath + "features_stones.jpg"); + gVideoTracker->drawDetection(true); + }); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVideoTrackFeatures::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + /* + The tracking of markers is done in AppDemoVideo::onUpdateVideo by calling + the specific CVTracked::track method. If a marker was found it overwrites + the linked nodes object matrix (SLNode::_om). If the linked node is the + active camera the found transform is additionally inversed. This would be + the standard augmented reality use case. + */ + + CVCapture::instance()->videoType(VT_MAIN); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 2, 60); + cam1->lookAt(15, 15, 0); + cam1->clipNear(0.1f); + cam1->clipFar(1000.0f); // Increase to infinity? 
+ cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + cam1->background().texture(gVideoTexture); + + SLLightSpot* light1 = new SLLightSpot(am, + this, + 420, + 420, + 420, + 1); + light1->powers(1.0f, 1.0f, 1.0f); + + // Coordinate axis node + SLNode* axis = new SLNode(new SLCoordAxis(am), + "Axis Node"); + axis->setDrawBitsRec(SL_DB_MESHWIRED, false); + axis->scale(100); + axis->rotate(-90, 1, 0, 0); + + // Yellow center box + SLMaterial* yellow = new SLMaterial(am, + "mY", + SLCol4f(1, 1, 0, 0.5f)); + SLNode* box = new SLNode(new SLBox(am, + 0, + 0, + 0, + 100, + 100, + 100, + "Box", + yellow), + "Box Node"); + box->rotate(-90, 1, 0, 0); + + // Scene structure + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(light1); + scene->addChild(axis); + scene->addChild(box); + scene->addChild(cam1); + + // The tracker moves the camera + gVideoTrackedNode = cam1; + + sv->doWaitOnIdle(false); // for constant video feed + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFeatures.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFeatures.h new file mode 100644 index 00000000..c5ec566e --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackFeatures.h @@ -0,0 +1,48 @@ +/** + * \file AppDemoSceneVideoTrackFeatures.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOFEATURE_H +#define APPDEMOSCENEVIDEOFEATURE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for feature tracking test scene +class AppDemoSceneVideoTrackFeatures : public SLScene +{ +public: + AppDemoSceneVideoTrackFeatures(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackMediapipe.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackMediapipe.cpp new file mode 100644 index 00000000..7ab7815f --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackMediapipe.cpp @@ -0,0 +1,79 @@ +/** + * \file AppDemoSceneVideoTrackMediapipe.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include + +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTrackMediapipe::AppDemoSceneVideoTrackMediapipe() + : SLScene("Mediapipe Hand Tracking") +{ + info("Mediapipe Hand Tracking"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneVideoTrackMediapipe::registerAssetsToLoad(SLAssetLoader& al) +{ +#ifdef SL_BUILD_WITH_MEDIAPIPE + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + // Create MediaPipe hand tracker + al.addLoadTask([] { + gVideoTracker = new CVTrackedMediaPipeHands(AppCommon::dataPath); + gVideoTracker->drawDetection(true); + }); +#endif +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVideoTrackMediapipe::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + /* + The tracking of markers is done in AppDemoVideo::onUpdateVideo by calling + the specific CVTracked::track method. If a marker was found it overwrites + the linked nodes object matrix (SLNode::_om). If the linked node is the + active camera the found transform is additionally inversed. This would be + the standard augmented reality use case. 
+ */ + +#ifdef SL_BUILD_WITH_MEDIAPIPE + CVCapture::instance()->videoType(VT_MAIN); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->background().texture(gVideoTexture); + + SLNode* scene = new SLNode("Scene"); + root3D(scene); + + // The tracker moves the camera + gVideoTrackedNode = cam1; + + sv->doWaitOnIdle(false); + sv->camera(cam1); +#endif +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackMediapipe.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackMediapipe.h new file mode 100644 index 00000000..a84b86ec --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackMediapipe.h @@ -0,0 +1,48 @@ +/** + * \file AppDemoSceneVideoTrackMediapipe.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOTRACKMEDIAPIPE_H +#define APPDEMOSCENEVIDEOTRACKMEDIAPIPE_H + +#include + +//----------------------------------------------------------------------------- +//! Class for Mediapipe hand tracking test scene +class AppDemoSceneVideoTrackMediapipe : public SLScene +{ +public: + AppDemoSceneVideoTrackMediapipe(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. 
OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackWAI.cpp b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackWAI.cpp new file mode 100644 index 00000000..74c4a349 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackWAI.cpp @@ -0,0 +1,129 @@ +/** + * \file AppDemoSceneVideoTrackWAI.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef SL_BUILD_WAI +# include +#endif + +#ifdef SL_BUILD_WAI +// Global pointers declared in AppDemoVideo +extern SLGLTexture* gVideoTexture; +extern CVTracked* gVideoTracker; +extern SLNode* gVideoTrackedNode; +#endif + +//----------------------------------------------------------------------------- +AppDemoSceneVideoTrackWAI::AppDemoSceneVideoTrackWAI() + : SLScene("Feature Tracking with ORBSLAM library") +{ + info("Feature Tracking with ORBSLAM library"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneVideoTrackWAI::registerAssetsToLoad(SLAssetLoader& al) +{ +#ifdef SL_BUILD_WAI + // Create video texture on global pointer updated in AppDemoVideo + al.addTextureToLoad(gVideoTexture, + AppCommon::texturePath + + "LiveVideoError.png", + GL_LINEAR, + GL_LINEAR); + + al.addLoadTask([] { + // Create OpenCV Tracker for the box node + std::string vocFileName; +# if USE_FBOW + vocFileName = "voc_fbow.bin"; +# else + vocFileName = "ORBvoc.bin"; +# endif + gVideoTracker = new CVTrackedWAI(Utils::findFile(vocFileName, {AppCommon::calibIniPath, AppCommon::exePath})); + gVideoTracker->drawDetection(true); + }); +#endif +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneVideoTrackWAI::assemble(SLAssetManager* am, + SLSceneView* sv) +{ +#ifdef SL_BUILD_WAI + CVCapture::instance()->videoType(VT_MAIN); + + // Material + SLMaterial* yellow = new SLMaterial(am, + "mY", + SLCol4f(1, 1, 0, 0.5f)); + SLMaterial* cyan = new SLMaterial(am, + "mY", + SLCol4f(0, 1, 1, 0.5f)); + + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + root3D(scene); + + // Create a camera node 1 + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 5); + cam1->lookAt(0, 0, 0); + cam1->fov(CVCapture::instance()->activeCamera->calibration.cameraFovVDeg()); + cam1->background().texture(gVideoTexture); + cam1->setInitialState(); + scene->addChild(cam1); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.02f); + light1->translation(0.12f, 0.12f, 0.12f); + light1->name("light node"); + scene->addChild(light1); + + // Get the half edge length of the aruco marker + SLfloat edgeLen = 0.1f; + SLfloat he = edgeLen * 0.5f; + + // Build mesh & node that will be tracked by the 1st marker (camera) + SLBox* box1 = new SLBox(am, + -he, + -he, + -he, + he, + he, + he, + "Box 1", + yellow); + SLNode* boxNode1 = new SLNode(box1, "Box Node 1"); + SLNode* axisNode1 = new SLNode(new SLCoordAxis(am), "Axis Node 1"); + axisNode1->setDrawBitsRec(SL_DB_MESHWIRED, false); + axisNode1->scale(edgeLen); + axisNode1->translate(-he, -he, -he, TS_parent); + boxNode1->addChild(axisNode1); + boxNode1->setDrawBitsRec(SL_DB_CULLOFF, true); + boxNode1->translate(0.0f, 0.0f, 1.0f, TS_world); + scene->addChild(boxNode1); + + // The tracker moves the box node + gVideoTrackedNode = cam1; + + sv->camera(cam1); + sv->doWaitOnIdle(false); +#endif +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVideoTrackWAI.h b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackWAI.h new file mode 100644 index 00000000..c9aadd42 --- /dev/null +++ 
b/apps/app_demo/source/scenes/AppDemoSceneVideoTrackWAI.h @@ -0,0 +1,48 @@ +/** + * \file AppDemoSceneVideoTrackWAI.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVIDEOTRACKWAI_H +#define APPDEMOSCENEVIDEOTRACKWAI_H + +#include + +//----------------------------------------------------------------------------- +//! Class for Mediapipe hand tracking test scene +class AppDemoSceneVideoTrackWAI : public SLScene +{ +public: + AppDemoSceneVideoTrackWAI(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCast.cpp b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCast.cpp new file mode 100644 index 00000000..fb40f38e --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCast.cpp @@ -0,0 +1,117 @@ +/** + * \file AppDemoSceneVolumeRayCast.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneVolumeRayCast::AppDemoSceneVolumeRayCast() + : SLScene("Volume Ray Cast Test") +{ + info("Volume Rendering of an angiographic MRI scan"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneVolumeRayCast::registerAssetsToLoad(SLAssetLoader& al) +{ + // Load volume data into 3D texture + SLVstring mriImages; + for (SLint i = 0; i < 207; ++i) + mriImages.push_back(Utils::formatString(al.texturePath() + "i%04u_0000b.png", i)); + + SLint clamping3D = GL_CLAMP_TO_EDGE; + if (SLGLState::instance()->getSLVersionNO() > "320") + clamping3D = 0x812D; // GL_CLAMP_TO_BORDER + + al.addTextureToLoad(_mriTex3D, + mriImages, + GL_LINEAR, + GL_LINEAR, + clamping3D, + clamping3D, + "mri_head_front_to_back", + false); + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "VolumeRenderingRayCast.vert", + AppCommon::shaderPath + "VolumeRenderingRayCast.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneVolumeRayCast::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create transfer LUT 1D texture + SLVAlphaLUTPoint tfAlphas = {SLAlphaLUTPoint(0.00f, 0.00f), + SLAlphaLUTPoint(0.01f, 0.75f), + SLAlphaLUTPoint(1.00f, 1.00f)}; + SLTexColorLUT* tf = new SLTexColorLUT(am, + tfAlphas, + CLUT_BCGYR); + + // Load shader and uniforms for volume size + SLGLUniform1f* volX = new SLGLUniform1f(UT_const, + "u_volumeX", + (SLfloat)_mriTex3D->images()[0]->width()); + SLGLUniform1f* volY = new SLGLUniform1f(UT_const, + "u_volumeY", + (SLfloat)_mriTex3D->images()[0]->height()); + SLGLUniform1f* volZ = new SLGLUniform1f(UT_const, + "u_volumeZ", + (SLfloat)_mriTex3D->images().size()); + _sp->addUniform1f(volX); + _sp->addUniform1f(volY); + _sp->addUniform1f(volZ); + + // Create volume rendering material + SLMaterial* matVR = new SLMaterial(am, + "matVR", + _mriTex3D, + tf, + nullptr, + nullptr, + _sp); + + // Create camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 3); + cam1->lookAt(0, 0, 0); + cam1->focalDist(3); + cam1->background().colors(SLCol4f(0, 0, 0)); + cam1->setInitialState(); + 
cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + // Set light + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->powers(0.1f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + light1->translation(5, 5, 5); + + // Assemble scene with box node + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(light1); + scene->addChild(new SLNode(new SLBox(am, + -1, + -1, + -1, + 1, + 1, + 1, + "Box", + matVR))); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCast.h b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCast.h new file mode 100644 index 00000000..27779e68 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCast.h @@ -0,0 +1,50 @@ +/** + * \file AppDemoSceneVolumeRayCast.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVOLUMERAYCAST_H +#define APPDEMOSCENEVOLUMERAYCAST_H + +#include + +//----------------------------------------------------------------------------- +//! Class for test scene for volume rendering of an angiographic MRI scan" +class AppDemoSceneVolumeRayCast : public SLScene +{ +public: + AppDemoSceneVolumeRayCast(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. 
Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _mriTex3D; + SLGLProgram* _sp; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCastLighted.cpp b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCastLighted.cpp new file mode 100644 index 00000000..d435b2f3 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCastLighted.cpp @@ -0,0 +1,123 @@ +/** + * \file AppDemoSceneVolumeRayCastLighted.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. 
See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneVolumeRayCastLighted::AppDemoSceneVolumeRayCastLighted() + : SLScene("Lighted Volume Ray Cast Test") +{ + info("Lighted Volume Rendering of an angiographic MRI scan"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. +void AppDemoSceneVolumeRayCastLighted::registerAssetsToLoad(SLAssetLoader& al) +{ + // Load volume data into 3D texture + SLVstring mriImages; + for (SLint i = 0; i < 207; ++i) + mriImages.push_back(Utils::formatString(al.texturePath() + "i%04u_0000b.png", i)); + + SLint clamping3D = GL_CLAMP_TO_EDGE; + if (SLGLState::instance()->getSLVersionNO() > "320") + clamping3D = 0x812D; // GL_CLAMP_TO_BORDER + + al.addTextureToLoad(_mriTex3D, + mriImages, + GL_LINEAR, + GL_LINEAR, + clamping3D, + clamping3D, + "mri_head_front_to_back", + true); + + al.addLoadTask([=]() { _mriTex3D->calc3DGradients(1); }); + //al.addLoadTask([=]() + // { _mriTex3D->smooth3DGradients(1); }); + + al.addProgramToLoad(_sp, + AppCommon::shaderPath + "VolumeRenderingRayCast.vert", + AppCommon::shaderPath + "VolumeRenderingRayCastLighted.frag"); +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. 
+void AppDemoSceneVolumeRayCastLighted::assemble(SLAssetManager* am, + SLSceneView* sv) +{ + // Create transfer LUT 1D texture + SLVAlphaLUTPoint tfAlphas = {SLAlphaLUTPoint(0.00f, 0.00f), + SLAlphaLUTPoint(0.01f, 0.75f), + SLAlphaLUTPoint(1.00f, 1.00f)}; + SLTexColorLUT* tf = new SLTexColorLUT(am, + tfAlphas, + CLUT_BCGYR); + + SLGLUniform1f* volX = new SLGLUniform1f(UT_const, + "u_volumeX", + (SLfloat)_mriTex3D->images()[0]->width()); + SLGLUniform1f* volY = new SLGLUniform1f(UT_const, + "u_volumeY", + (SLfloat)_mriTex3D->images()[0]->height()); + SLGLUniform1f* volZ = new SLGLUniform1f(UT_const, + "u_volumeZ", + (SLfloat)_mriTex3D->images().size()); + _sp->addUniform1f(volX); + _sp->addUniform1f(volY); + _sp->addUniform1f(volZ); + + // Create volume rendering material + SLMaterial* matVR = new SLMaterial(am, + "matVR", + _mriTex3D, + tf, + nullptr, + nullptr, + _sp); + + // Create camera + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->translation(0, 0, 3); + cam1->lookAt(0, 0, 0); + cam1->focalDist(3); + cam1->background().colors(SLCol4f(0, 0, 0)); + cam1->setInitialState(); + cam1->devRotLoc(&AppCommon::devRot, &AppCommon::devLoc); + + // Set light + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->powers(0.1f, 1.0f, 1.0f); + light1->attenuation(1, 0, 0); + light1->translation(5, 5, 5); + + // Assemble scene with box node + SLNode* scene = new SLNode("Scene"); + root3D(scene); + scene->addChild(light1); + scene->addChild(new SLNode(new SLBox( + am, + -1, + -1, + -1, + 1, + 1, + 1, + "Box", + matVR))); + scene->addChild(cam1); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCastLighted.h b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCastLighted.h new file mode 100644 index 00000000..0e897687 --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneVolumeRayCastLighted.h @@ -0,0 +1,50 @@ +/** + * \file 
AppDemoSceneVolumeRayCastLighted.h + * \brief Class declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEVOLUMERAYCASTLIGHTED_H +#define APPDEMOSCENEVOLUMERAYCASTLIGHTED_H + +#include + +//----------------------------------------------------------------------------- +//! Class for test scene for lighted volume rendering of an angiographic MRI scan" +class AppDemoSceneVolumeRayCastLighted : public SLScene +{ +public: + AppDemoSceneVolumeRayCastLighted(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene sspecific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; + +private: + SLGLTexture* _mriTex3D; + SLGLProgram* _sp; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo/source/scenes/AppDemoSceneZFighting.cpp b/apps/app_demo/source/scenes/AppDemoSceneZFighting.cpp new file mode 100644 index 00000000..d1c28daf --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneZFighting.cpp @@ -0,0 +1,89 @@ +/** + * \file AppDemoSceneZFighting.cpp + * \brief Implementation for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#include +#include +#include +#include +#include + +//----------------------------------------------------------------------------- +AppDemoSceneZFighting::AppDemoSceneZFighting() : SLScene("Z-Fighting Test Scene") +{ + info("The reason for this depth fighting is that the camera's near clipping distance " + "is almost zero and the far clipping distance is too large. The depth buffer only " + "has 32-bit precision, which leads to this fighting effect when the distance " + "between the near and far clipping planes is too large. You can adjust these " + "values over the menu Camera > Projection"); +} +//----------------------------------------------------------------------------- +//! All assets the should be loaded in parallel must be registered in here. 
+void AppDemoSceneZFighting::registerAssetsToLoad(SLAssetLoader& al) +{ +} +//----------------------------------------------------------------------------- +//! After parallel loading of the assets the scene gets assembled in here. +void AppDemoSceneZFighting::assemble(SLAssetManager* am, SLSceneView* sv) +{ + // Create a scene group node + SLNode* scene = new SLNode("scene node"); + this->root3D(scene); + + SLCamera* cam1 = new SLCamera("Camera 1"); + cam1->clipNear(0.0001f); + cam1->clipFar(1000000); + cam1->translation(0, 0, 4); + cam1->lookAt(0, 0, 0); + cam1->focalDist(4); + cam1->background().colors(SLCol4f(0.7f, 0.7f, 0.7f), + SLCol4f(0.2f, 0.2f, 0.2f)); + cam1->setInitialState(); + scene->addChild(cam1); + + // Create materials + SLMaterial* matR = new SLMaterial(am, "matR", SLCol4f::RED); + SLMaterial* matG = new SLMaterial(am, "matG", SLCol4f::GREEN); + + // Create a light source node + SLLightSpot* light1 = new SLLightSpot(am, this, 0.3f); + light1->translation(5, 0, 5); + light1->name("light node"); + scene->addChild(light1); + + // Create two squares + SLMesh* rectMeshR = new SLRectangle(am, + SLVec2f(-1, -1), + SLVec2f(1, 1), + 1, + 1, + "RectR", + matR); + SLNode* rectNodeR = new SLNode(rectMeshR, "Rect Node Red"); + scene->addChild(rectNodeR); + + SLMesh* rectMeshG = new SLRectangle(am, + SLVec2f(-0.8f, -0.8f), + SLVec2f(0.8f, 0.8f), + 1, + 1, + "RectG", + matG); + SLNode* rectNodeG = new SLNode(rectMeshG, "Rect Node Green"); + rectNodeG->rotate(2.0f, 1, 1, 0); + scene->addChild(rectNodeG); + + // Save energy + sv->doWaitOnIdle(true); + + sv->camera(cam1); +} +//----------------------------------------------------------------------------- diff --git a/apps/app_demo/source/scenes/AppDemoSceneZFighting.h b/apps/app_demo/source/scenes/AppDemoSceneZFighting.h new file mode 100644 index 00000000..6aa974fe --- /dev/null +++ b/apps/app_demo/source/scenes/AppDemoSceneZFighting.h @@ -0,0 +1,46 @@ +/** + * \file AppDemoSceneZFighting.h + * \brief Class 
declaration for an SLScene inherited class + * \details For more info about App framework and the scene assembly see: + * https://cpvrlab.github.io/SLProject4/app-framework.html + * \date May 2024 + * \authors Marino von Wattenwyl + * \copyright http://opensource.org/licenses/GPL-3.0 + * \remarks Please use clangformat to format the code. See more code style on + * https://github.com/cpvrlab/SLProject4/wiki/SLProject-Coding-Style +*/ + +#ifndef APPDEMOSCENEZFIGHTING_H +#define APPDEMOSCENEZFIGHTING_H + +#include + +//----------------------------------------------------------------------------- +//! Class for z-fighting test scene +class AppDemoSceneZFighting : public SLScene +{ +public: + AppDemoSceneZFighting(); + + //! All scene specific assets have to be registered for async loading in here. + /*! @remark All scene specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there are + no OpenGL calls allowed. OpenGL calls are only allowed in the main thread.*/ + void registerAssetsToLoad(SLAssetLoader& al) override; + + //! After parallel loading of the assets the scene gets assembled in here. + /*! @remark All scene-specific assets have to be loaded async by overriding + SLScene::registerAssetsToLoad and SLScene::assemble. Async loading and + assembling means that it happens in a parallel thread and that in there + are no OpenGL calls allowed. OpenGL calls are only allowed in the main + thread. It is important that all object instantiations within + SLScene::assemble do NOT call any OpenGL functions (gl*) because they happen + in a parallel thread. 
All objects that get rendered have to do their + initialization when they are used the first time during rendering in the + main thread.*/ + void assemble(SLAssetManager* am, SLSceneView* sv) override; +}; +//----------------------------------------------------------------------------- + +#endif diff --git a/apps/app_demo_node/CMakeLists.txt b/apps/app_demo_node/CMakeLists.txt deleted file mode 100644 index 6a661e3c..00000000 --- a/apps/app_demo_node/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(glfw) diff --git a/apps/app_demo_node/glfw/AppNodeMainGLFW.cpp b/apps/app_demo_node/glfw/AppNodeMainGLFW.cpp deleted file mode 100644 index e923711b..00000000 --- a/apps/app_demo_node/glfw/AppNodeMainGLFW.cpp +++ /dev/null @@ -1,543 +0,0 @@ -//############################################################################# -// File: AppNodeMainGLFW.cpp -// Purpose: Implementation of the GUI with the GLFW3 (http://www.glfw.org/) -// Date: July 2014 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# - -#include -#include "AppNodeGui.h" -#include "AppNodeSceneView.h" -#include -#include -#include -#include -#include -#include - -extern void appNodeLoadScene(SLAssetManager* am, - SLScene* s, - SLSceneView* sv, - SLSceneID sid); - -//----------------------------------------------------------------------------- -// Global application variables -GLFWwindow* window; //!< The global GLFW window handle. 
-SLint svIndex; //!< SceneView index -SLint scrWidth; //!< Window width at start up -SLint scrHeight; //!< Window height at start up -SLint mouseX; //!< Last mouse position x in pixels -SLint mouseY; //!< Last mouse position y in pixels -SLint touchX2; //!< Last finger touch 2 position x in pixels -SLint touchY2; //!< Last finger touch 2 position y in pixels -SLint touchDeltaX; //!< Delta between two fingers in x -SLint touchDeltaY; //!< Delta between two fingers in < -SLint lastWidth; //!< Last window width in pixels -SLint lastHeight; //!< Last window height in pixels -SLint lastMouseWheelPos; //!< Last mouse wheel position -SLfloat lastMouseDownTime = 0.0f; //!< Last mouse press time -SLKey modifiers = K_none; //!< last modifier keys -SLbool fullscreen = false; //!< flag if window is in fullscreen mode -SLint dpi = 142; //!< Dot per inch resolution of screen - -//------------------------------------------------------------------------- -/*! -onClose event handler for deallocation of the scene & sceneview. onClose is -called glfwPollEvents, glfwWaitEvents or glfwSwapBuffers. -*/ -void onClose(GLFWwindow* myWindow) -{ - slShouldClose(true); -} - -//----------------------------------------------------------------------------- -/*! -onPaint: Paint event handler that passes the event to the slPaint function. -For accurate frame rate measurement we have to take the time after the OpenGL -frame buffer swapping. The FPS calculation is done in slGetWindowTitle. -*/ -SLbool onPaint() -{ - if (AppDemo::sceneViews.empty()) - return false; - SLSceneView* sv = AppDemo::sceneViews[svIndex]; - - /////////////////////////////////////////////// - bool jobIsRunning = slUpdateParallelJob(); - bool viewsNeedsRepaint = slPaintAllViews(); - /////////////////////////////////////////////// - - return jobIsRunning || viewsNeedsRepaint; -} -//----------------------------------------------------------------------------- -//! 
Maps the GLFW key codes to the SLKey codes -SLKey mapKeyToSLKey(SLint key) -{ - switch (key) - { - case GLFW_KEY_SPACE: return K_space; - case GLFW_KEY_ESCAPE: return K_esc; - case GLFW_KEY_F1: return K_F1; - case GLFW_KEY_F2: return K_F2; - case GLFW_KEY_F3: return K_F3; - case GLFW_KEY_F4: return K_F4; - case GLFW_KEY_F5: return K_F5; - case GLFW_KEY_F6: return K_F6; - case GLFW_KEY_F7: return K_F7; - case GLFW_KEY_F8: return K_F8; - case GLFW_KEY_F9: return K_F9; - case GLFW_KEY_F10: return K_F10; - case GLFW_KEY_F11: return K_F11; - case GLFW_KEY_F12: return K_F12; - case GLFW_KEY_UP: return K_up; - case GLFW_KEY_DOWN: return K_down; - case GLFW_KEY_LEFT: return K_left; - case GLFW_KEY_RIGHT: return K_right; - case GLFW_KEY_LEFT_SHIFT: return K_shift; - case GLFW_KEY_RIGHT_SHIFT: return K_shift; - case GLFW_KEY_LEFT_CONTROL: return K_ctrl; - case GLFW_KEY_RIGHT_CONTROL: return K_ctrl; - case GLFW_KEY_LEFT_ALT: return K_alt; - case GLFW_KEY_RIGHT_ALT: return K_alt; - case GLFW_KEY_LEFT_SUPER: return K_super; // Apple command key - case GLFW_KEY_RIGHT_SUPER: return K_super; // Apple command key - case GLFW_KEY_TAB: return K_tab; - case GLFW_KEY_ENTER: return K_enter; - case GLFW_KEY_BACKSPACE: return K_backspace; - case GLFW_KEY_INSERT: return K_insert; - case GLFW_KEY_DELETE: return K_delete; - case GLFW_KEY_PAGE_UP: return K_pageUp; - case GLFW_KEY_PAGE_DOWN: return K_pageDown; - case GLFW_KEY_HOME: return K_home; - case GLFW_KEY_END: return K_end; - case GLFW_KEY_KP_0: return K_NP0; - case GLFW_KEY_KP_1: return K_NP1; - case GLFW_KEY_KP_2: return K_NP2; - case GLFW_KEY_KP_3: return K_NP3; - case GLFW_KEY_KP_4: return K_NP4; - case GLFW_KEY_KP_5: return K_NP5; - case GLFW_KEY_KP_6: return K_NP6; - case GLFW_KEY_KP_7: return K_NP7; - case GLFW_KEY_KP_8: return K_NP8; - case GLFW_KEY_KP_9: return K_NP9; - case GLFW_KEY_KP_DIVIDE: return K_NPDivide; - case GLFW_KEY_KP_MULTIPLY: return K_NPMultiply; - case GLFW_KEY_KP_SUBTRACT: return K_NPSubtract; - case 
GLFW_KEY_KP_ADD: return K_NPAdd; - case GLFW_KEY_KP_DECIMAL: return K_NPDecimal; - case GLFW_KEY_UNKNOWN: return K_none; - default: break; - } - return (SLKey)key; -} -//----------------------------------------------------------------------------- -/*! -onResize: Event handler called on the resize event of the window. This event -should called once before the onPaint event. -*/ -static void onResize(GLFWwindow* myWindow, - int width, - int height) -{ - if (AppDemo::sceneViews.empty()) return; - SLSceneView* sv = AppDemo::sceneViews[svIndex]; - - lastWidth = width; - lastHeight = height; - - // width & height are in screen coords. - // We need to scale them to framebuffer coords. - slResize(svIndex, width, height); -} -//----------------------------------------------------------------------------- -/*! -Mouse button event handler forwards the events to the slMouseDown or slMouseUp. -Two finger touches of touch devices are simulated with ALT & CTRL modifiers. -*/ -static void onMouseButton(GLFWwindow* myWindow, - int button, - int action, - int mods) -{ - SLint x = mouseX; - SLint y = mouseY; - - // Translate modifiers - modifiers = K_none; - if (mods & GLFW_MOD_SHIFT) modifiers = (SLKey)(modifiers | K_shift); - if (mods & GLFW_MOD_CONTROL) modifiers = (SLKey)(modifiers | K_ctrl); - if (mods & GLFW_MOD_ALT) modifiers = (SLKey)(modifiers | K_alt); - - if (action == GLFW_PRESS) - { - // simulate double touch from touch devices - if (modifiers & K_alt) - { - // Do parallel double finger move - if (modifiers & K_shift) - slTouch2Down(svIndex, x, y, x - touchDeltaX, y - touchDeltaY); - else // Do concentric double finger pinch - slTouch2Down(svIndex, x, y, touchX2, touchY2); - } - else // Do standard mouse down - { - SLfloat mouseDeltaTime = (SLfloat)glfwGetTime() - lastMouseDownTime; - lastMouseDownTime = (SLfloat)glfwGetTime(); - - // handle double click - if (mouseDeltaTime < 0.3f) - { - switch (button) - { - case GLFW_MOUSE_BUTTON_LEFT: - slDoubleClick(svIndex, 
MB_left, x, y, modifiers); - break; - case GLFW_MOUSE_BUTTON_RIGHT: - slDoubleClick(svIndex, MB_right, x, y, modifiers); - break; - case GLFW_MOUSE_BUTTON_MIDDLE: - slDoubleClick(svIndex, MB_middle, x, y, modifiers); - break; - default: break; - } - } - else // normal mouse clicks - { - switch (button) - { - case GLFW_MOUSE_BUTTON_LEFT: - slMouseDown(svIndex, MB_left, x, y, modifiers); - break; - case GLFW_MOUSE_BUTTON_RIGHT: - slMouseDown(svIndex, MB_right, x, y, modifiers); - break; - case GLFW_MOUSE_BUTTON_MIDDLE: - slMouseDown(svIndex, MB_middle, x, y, modifiers); - break; - default: break; - } - } - } - } - else - { - // simulate double touch from touch devices - if (modifiers & K_alt) - { - // Do parallel double finger move - if (modifiers & K_shift) - { - slTouch2Up(svIndex, x, y, x - (touchX2 - x), y - (touchY2 - y)); - } - else // Do concentric double finger pinch - { - slTouch2Up(svIndex, x, y, touchX2, touchY2); - } - } - else // Do standard mouse down - { - switch (button) - { - case GLFW_MOUSE_BUTTON_LEFT: - slMouseUp(svIndex, MB_left, x, y, modifiers); - break; - case GLFW_MOUSE_BUTTON_RIGHT: - slMouseUp(svIndex, MB_right, x, y, modifiers); - break; - case GLFW_MOUSE_BUTTON_MIDDLE: - slMouseUp(svIndex, MB_middle, x, y, modifiers); - break; - default: break; - } - } - } -} -//----------------------------------------------------------------------------- -/*! -Mouse move event handler forwards the events to slMouseMove or slTouch2Move. -*/ -static void onMouseMove(GLFWwindow* myWindow, - double x, - double y) -{ - // x & y are in screen coords. - mouseX = (int)x; - mouseY = (int)y; - - // Offset of 2nd. 
finger for two finger simulation - - // Simulate double finger touches - if (modifiers & K_alt) - { - // Do parallel double finger move - if (modifiers & K_shift) - { - slTouch2Move(svIndex, - (int)x, - (int)y, - (int)x - touchDeltaX, - (int)y - touchDeltaY); - } - else // Do concentric double finger pinch - { - int scrW2 = lastWidth / 2; - int scrH2 = lastHeight / 2; - touchX2 = scrW2 - ((int)x - scrW2); - touchY2 = scrH2 - ((int)y - scrH2); - touchDeltaX = (int)x - touchX2; - touchDeltaY = (int)y - touchY2; - slTouch2Move(svIndex, (int)x, (int)y, touchX2, touchY2); - } - } - else // Do normal mouse move - slMouseMove(svIndex, (int)x, (int)y); -} -//----------------------------------------------------------------------------- -/*! -Mouse wheel event handler forwards the events to slMouseWheel -*/ -static void onMouseWheel(GLFWwindow* myWindow, - double xscroll, - double yscroll) -{ - // make sure the delta is at least one integer - int dY = (int)yscroll; - if (dY == 0) dY = (int)(Utils::sign(yscroll)); - - slMouseWheel(svIndex, dY, modifiers); -} -//----------------------------------------------------------------------------- -/*! -Key action event handler sets the modifier key state & forwards the event to -the slKeyPress function. 
-*/ -static void onKeyAction(GLFWwindow* myWindow, - int GLFWKey, - int scancode, - int action, - int mods) -{ - SLKey key = mapKeyToSLKey(GLFWKey); - - if (action == GLFW_PRESS) - { - switch (key) - { - case K_ctrl: modifiers = (SLKey)(modifiers | K_ctrl); return; - case K_alt: modifiers = (SLKey)(modifiers | K_alt); return; - case K_shift: modifiers = (SLKey)(modifiers | K_shift); return; - } - } - else if (action == GLFW_RELEASE) - { - switch (key) - { - case K_ctrl: modifiers = (SLKey)(modifiers ^ K_ctrl); return; - case K_alt: modifiers = (SLKey)(modifiers ^ K_alt); return; - case K_shift: modifiers = (SLKey)(modifiers ^ K_shift); return; - } - } - - // Special treatment for ESC key - if (key == K_esc && action == GLFW_RELEASE) - { - if (fullscreen) - { - fullscreen = !fullscreen; - glfwSetWindowSize(myWindow, scrWidth, scrHeight); - glfwSetWindowPos(myWindow, 10, 30); - } - else - { - slKeyPress(svIndex, key, modifiers); // ESC during RT stops it and returns false - onClose(myWindow); - glfwSetWindowShouldClose(myWindow, GL_TRUE); - } - } - else if (key == K_F9 && action == GLFW_PRESS) // Toggle fullscreen mode - { - fullscreen = !fullscreen; - - if (fullscreen) - { - GLFWmonitor* primary = glfwGetPrimaryMonitor(); - const GLFWvidmode* mode = glfwGetVideoMode(primary); - glfwSetWindowSize(myWindow, mode->width, mode->height); - glfwSetWindowPos(myWindow, 0, 0); - } - else - { - glfwSetWindowSize(myWindow, scrWidth, scrHeight); - glfwSetWindowPos(myWindow, 10, 30); - } - } - else - { - if (action == GLFW_PRESS) - slKeyPress(svIndex, key, modifiers); - else if (action == GLFW_RELEASE) - slKeyRelease(svIndex, key, modifiers); - } -} -//----------------------------------------------------------------------------- -//! Error callback handler for GLFW. -void onGLFWError(int error, const char* description) -{ - fputs(description, stderr); -} -//----------------------------------------------------------------------------- -//! 
Alternative SceneView creation C-function passed by slCreateSceneView -SLSceneView* createAppNodeSceneView(SLScene* scene, - int myDPI, - SLInputManager& inputManager) -{ - return new AppNodeSceneView(scene, myDPI, inputManager); -} -//----------------------------------------------------------------------------- -//! Initialises all GLFW and GL3W stuff -void initGLFW(int screenWidth, int screenHeight) -{ - if (!glfwInit()) - { - fprintf(stderr, "Failed to initialize GLFW\n"); - exit(EXIT_FAILURE); - } - - glfwSetErrorCallback(onGLFWError); - - // Enable fullscreen anti aliasing with 4 samples - glfwWindowHint(GLFW_SAMPLES, 4); - -#ifdef __APPLE__ - // You can enable or restrict newer OpenGL context here (read the GLFW documentation) - glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3); - glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); - glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); - glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); - glfwWindowHint(GLFW_COCOA_RETINA_FRAMEBUFFER, GL_FALSE); -#endif - - window = glfwCreateWindow(screenWidth, screenHeight, "My Title", nullptr, nullptr); - - // get real window size - glfwGetWindowSize(window, &scrWidth, &scrHeight); - - if (!window) - { - glfwTerminate(); - exit(EXIT_FAILURE); - } - - // Get the current GL context. 
After this you can call GL - glfwMakeContextCurrent(window); - - // Init OpenGL access library gl3w - if (gl3wInit() != 0) - { - cerr << "Failed to initialize OpenGL" << endl; - exit(-1); - } - - glfwSetWindowTitle(window, "SLProject Test Application"); - glfwSetWindowPos(window, 10, 30); - - // With GLFW ImGui draws the cursor - glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_HIDDEN); - - // Set number of monitor refreshes between 2 buffer swaps - glfwSwapInterval(2); - - // Get GL errors that occurred before our framework is involved - GET_GL_ERROR; - - // Set your own physical screen dpi - Utils::log("SLProject", "------------------------------------------------------------------"); - Utils::log("SLProject", - "GUI-Framwork : GLFW (Version: %d.%d.%d", - GLFW_VERSION_MAJOR, - GLFW_VERSION_MINOR, - GLFW_VERSION_REVISION); - Utils::log("SLProject", - "Resolution (DPI) : %d", - dpi); - - // Set GLFW callback functions - glfwSetKeyCallback(window, onKeyAction); - glfwSetWindowSizeCallback(window, onResize); - glfwSetMouseButtonCallback(window, onMouseButton); - glfwSetCursorPosCallback(window, onMouseMove); - glfwSetScrollCallback(window, onMouseWheel); - glfwSetWindowCloseCallback(window, onClose); -} -//----------------------------------------------------------------------------- -/*! -The C main procedure running the GLFW GUI application. 
-*/ -int main(int argc, char* argv[]) -{ - // set command line arguments - SLVstring cmdLineArgs; - for (int i = 0; i < argc; i++) - cmdLineArgs.push_back(argv[i]); - - initGLFW(640, 480); - - // get executable path - SLstring projectRoot = SLstring(SL_PROJECT_ROOT); - SLstring configPath = Utils::getAppsWritableDir(); - - ////////////////////////////////////////////////////////// - slCreateAppAndScene(cmdLineArgs, - projectRoot + "/data/", - projectRoot + "/data/shaders/", - projectRoot + "/data/models/", - projectRoot + "/data/images/textures/", - projectRoot + "/data/images/fonts/", - projectRoot + "/data/videos/", - configPath, - "AppNode_GLFW", - (void*)appNodeLoadScene); - ////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////// - svIndex = slCreateSceneView(AppDemo::assetManager, - AppDemo::scene, - scrWidth, - scrHeight, - dpi, - (SLSceneID)0, - (void*)&onPaint, - nullptr, - (void*)createAppNodeSceneView, - (void*)AppNodeGui::build); - ////////////////////////////////////////////////////////// - - // Event loop - while (!slShouldClose()) - { - ///////////////////////////// - SLbool doRepaint = onPaint(); - ///////////////////////////// - - // Fast copy the back buffer to the front buffer. This is OS dependent. - glfwSwapBuffers(window); - - // Show the title generated by the scene library (FPS etc.) 
- glfwSetWindowTitle(window, slGetWindowTitle(svIndex).c_str()); - - // if no updated occurred wait for the next event (power saving) - if (!doRepaint) - glfwWaitEvents(); - else - glfwPollEvents(); - } - - slTerminate(); - glfwDestroyWindow(window); - glfwTerminate(); - exit(0); -} -//----------------------------------------------------------------------------- diff --git a/apps/app_demo_node/glfw/CMakeLists.txt b/apps/app_demo_node/glfw/CMakeLists.txt deleted file mode 100644 index e604a6fe..00000000 --- a/apps/app_demo_node/glfw/CMakeLists.txt +++ /dev/null @@ -1,92 +0,0 @@ -# -# CMake configuration for app-Demo-Node application -# - -set(target app-Demo-Node) -set(include_path "${CMAKE_CURRENT_SOURCE_DIR}") -set(source_path "${CMAKE_CURRENT_SOURCE_DIR}") - -file(GLOB headers - ${SL_PROJECT_ROOT}/apps/app_demo_node/include/AppNodeGui.h - ${SL_PROJECT_ROOT}/apps/app_demo_node/include/AppNodeSceneView.h - ${SL_PROJECT_ROOT}/apps/source/AppDemo.h - ${SL_PROJECT_ROOT}/apps/source/SLInterface.h - ${SL_PROJECT_ROOT}/apps/source/SLScene.h - ) - -file(GLOB sources - ${SL_PROJECT_ROOT}/apps/app_demo_node/source/AppNodeGui.cpp - ${SL_PROJECT_ROOT}/apps/app_demo_node/source/AppNodeLoad.cpp - ${SL_PROJECT_ROOT}/apps/app_demo_node/source/AppNodeSceneView.cpp - ${SL_PROJECT_ROOT}/apps/source/AppDemo.cpp - ${SL_PROJECT_ROOT}/apps/source/SLInterface.cpp - ${SL_PROJECT_ROOT}/apps/source/SLScene.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/AppNodeMainGLFW.cpp - ) - -add_executable( - ${target} - ${headers} - ${sources} -) - -enable_warnings(${target}) - -set_target_properties( - ${target} - PROPERTIES - ${DEFAULT_PROJECT_OPTIONS} - FOLDER "apps" -) - -target_include_directories( - ${target} - PRIVATE - ${SL_PROJECT_ROOT}/apps/app_demo_node/include - ${SL_PROJECT_ROOT}/modules/sl/source - ${SL_PROJECT_ROOT}/modules/sl/externals/imgui - ${SL_PROJECT_ROOT}/modules/sl/externals/spa - ${SL_PROJECT_ROOT}/modules/utils/externals/dirent - ${SL_PROJECT_ROOT}/modules/sl/externals/gl3w - 
${SL_PROJECT_ROOT}/modules/sl/externals/gl3w/GL - ${SL_PROJECT_ROOT}/apps/source - ${OpenCV_INCLUDE_DIR} - ${glfw_INCLUDE_DIR} - PUBLIC - INTERFACE -) - -target_link_libraries( - ${target} - PRIVATE - sl - ${glfw_LIBS} - PUBLIC - INTERFACE -) - -target_compile_definitions( - ${target} - PRIVATE - ${compile_definitions} - PUBLIC - ${DEFAULT_COMPILE_DEFINITIONS} - INTERFACE -) - -target_compile_options( - ${target} - PRIVATE - PUBLIC - ${DEFAULT_COMPILE_OPTIONS} - INTERFACE -) - -target_link_libraries( - ${target} - PRIVATE - PUBLIC - ${DEFAULT_LINKER_OPTIONS} - INTERFACE -) - diff --git a/apps/app_demo_node/source/AppNodeGui.cpp b/apps/app_demo_node/source/AppNodeGui.cpp deleted file mode 100644 index 3a81d9f3..00000000 --- a/apps/app_demo_node/source/AppNodeGui.cpp +++ /dev/null @@ -1,33 +0,0 @@ -//############################################################################# -// File: AppNodeGui.cpp -// Date: Summer 2017 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# - -#include "AppNodeGui.h" -#include "AppNodeSceneView.h" -#include -#include -#include -//----------------------------------------------------------------------------- -SLstring AppNodeGui::infoText = ""; -//----------------------------------------------------------------------------- -//! Creates the ImGui UI. -/*! This function must be passed as void* pointer to the slCreateSceneView -function. It is called in SLSceneView::onPaint for each frame. 
-*/ -void AppNodeGui::build(SLScene* s, SLSceneView* sv) -{ - ImGui::SetNextWindowPos(ImVec2(0.0f, 0.0f)); - ImGui::Begin("Scene Information", - 0, - ImGuiWindowFlags_NoResize | - ImGuiWindowFlags_AlwaysAutoResize | - ImGuiWindowFlags_NoMove); - ImGui::TextUnformatted(infoText.c_str()); - ImGui::End(); -} -//----------------------------------------------------------------------------- diff --git a/apps/app_demo_node/source/AppNodeLoad.cpp b/apps/app_demo_node/source/AppNodeLoad.cpp deleted file mode 100644 index fcef4cc2..00000000 --- a/apps/app_demo_node/source/AppNodeLoad.cpp +++ /dev/null @@ -1,49 +0,0 @@ -//############################################################################# -// File: AppNodeLoad.cpp -// Date: July 2015 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marc Wacker, Marcus Hudritsch -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# - -#include -#include -#include -#include -#include - -//----------------------------------------------------------------------------- -//! appNodeLoadScene builds a scene from source code. -/*! appDemoLoadScene builds a scene from source code. Such a function must be - passed as a void*-pointer to slCreateScene. It will be called from within - slCreateSceneView as soon as the view is initialized. You could separate - different scene by a different sceneID.
- The purpose is to assemble a scene by creating scenegraph objects with nodes - (SLNode) and meshes (SLMesh). See the scene with SID_Minimal for a minimal - example of the different steps. - */ -void appNodeLoadScene(SLAssetManager* am, SLScene* s, SLSceneView* sv, SLSceneID sid) -{ - s->init(am); - - SLCamera* cam1 = new SLCamera; - cam1->translation(2, 3, 5); - cam1->lookAt(-2, -1.0, 1); - cam1->focalDist(6); - cam1->background().colors(SLCol4f(0.8f, 0.8f, 0.8f)); - - SLLightSpot* light1 = new SLLightSpot(am, s, 0.3f); - light1->translation(10, 10, 10); - - SLNode* scene = new SLNode; - scene->addChild(light1); - scene->addChild(cam1); - - s->root3D(scene); - - sv->camera(cam1); - sv->doWaitOnIdle(false); - sv->onInitialize(); -} -//----------------------------------------------------------------------------- diff --git a/apps/app_demo_slproject/CMakeLists.txt b/apps/app_demo_slproject/CMakeLists.txt deleted file mode 100644 index 88c556e7..00000000 --- a/apps/app_demo_slproject/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -if("${SYSTEM_NAME_UPPER}" MATCHES "ANDROID") - set(IDE_FOLDER "Android") - add_subdirectory(android/app) -elseif("${SYSTEM_NAME_UPPER}" MATCHES "IOS") - add_subdirectory(iOS) -elseif("${SYSTEM_NAME_UPPER}" MATCHES "EMSCRIPTEN") - add_subdirectory(emscripten) -else() - add_subdirectory(glfw) -endif() diff --git a/apps/app_demo_slproject/android/app/CMakeLists.txt b/apps/app_demo_slproject/android/app/CMakeLists.txt deleted file mode 100644 index 826a03b2..00000000 --- a/apps/app_demo_slproject/android/app/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -# For more information about using CMake with Android Studio, read the -# documentation: https://d.android.com/studio/projects/add-native-code.html -#find_package(OpenCV REQUIRED) - -cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - -set(target native-lib) - -set(sources - ${CMAKE_CURRENT_SOURCE_DIR}/src/main/cpp/AppDemoAndroidJNI.cpp - 
${SL_PROJECT_ROOT}/apps/app_demo_slproject/source/AppDemoGui.cpp - ${SL_PROJECT_ROOT}/apps/app_demo_slproject/source/AppDemoLoad.cpp - ${SL_PROJECT_ROOT}/apps/app_demo_slproject/source/AppDemoVideo.cpp - ${SL_PROJECT_ROOT}/apps/app_demo_slproject/source/AppDemoSceneView.cpp - ${SL_PROJECT_ROOT}/apps/source/CVCapture.cpp - ${SL_PROJECT_ROOT}/apps/source/AppDemo.cpp - ${SL_PROJECT_ROOT}/apps/source/SLInterface.cpp - ) - -set(headers - ${SL_PROJECT_ROOT}/apps/app_demo_slproject/include/AppDemoGui.h - ${SL_PROJECT_ROOT}/apps/app_demo_slproject/include/AppDemoSceneView.h - ${SL_PROJECT_ROOT}/apps/source/CVCapture.h - ${SL_PROJECT_ROOT}/apps/source/AppDemo.h - ${SL_PROJECT_ROOT}/apps/source/SLInterface.h - ) - -add_library(${target} - SHARED - ${sources} - ${headers} - ) - -enable_warnings(${target}) - -target_include_directories( - ${target} - PRIVATE - PUBLIC - ${SL_PROJECT_ROOT}/apps/app_demo_slproject/include - ${SL_PROJECT_ROOT}/apps/source - ${SL_PROJECT_ROOT}/modules/sl/source - ${SL_PROJECT_ROOT}/modules/sl/externals/imgui - ${SL_PROJECT_ROOT}/modules/sl/externals/spa - ${SL_PROJECT_ROOT}/modules/utils/externals/zlib - ${DEFAULT_INCLUDE_DIRECTORIES} - ) - -target_link_libraries( - ${target} - ${DEFAULT_LIBRARIES} - libc++_shared.so - ${OpenCV_LIBS} - ${META_PROJECT_NAME}::sl - z - ) - -# -# Copy APK contents -# -include(CopyResourcesAppDemoSLProject) -copy_resources_slprojectdemo("${CMAKE_CURRENT_LIST_DIR}/src/main/assets/data") - -file(GLOB_RECURSE - MEDIAPIPE_ASSETS - ${SL_PROJECT_ROOT}/data/mediapipe/*.tflite - ${SL_PROJECT_ROOT}/data/mediapipe/*.txt -) - -foreach (ASSET_PATH ${MEDIAPIPE_ASSETS}) - get_filename_component(ASSET_FILENAME "${ASSET_PATH}" NAME) - message(STATUS "${TARGET_DIR}/${ASSET_FILENAME}") - file(COPY "${ASSET_PATH}" DESTINATION "${CMAKE_CURRENT_LIST_DIR}/src/main/assets") -endforeach () diff --git a/apps/app_demo_slproject/android/app/src/main/cpp/AppDemoAndroidJNI.cpp b/apps/app_demo_slproject/android/app/src/main/cpp/AppDemoAndroidJNI.cpp 
deleted file mode 100644 index 5450357e..00000000 --- a/apps/app_demo_slproject/android/app/src/main/cpp/AppDemoAndroidJNI.cpp +++ /dev/null @@ -1,358 +0,0 @@ -//############################################################################# -// File: AppDemoAndroidJNI.cpp -// Date: Spring 2017 -// Purpose: Android Java native interface into the SLProject C++ library -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marcus Hudritsch, Zingg Pascal -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -//############################################################################# - -#include -#include -#include -#include -#include -#include -#include - -#include "mediapipe.h" - -//----------------------------------------------------------------------------- -// Some global variable for the JNI interface -JNIEnv* environment; //! Pointer to JAVA environment used in ray tracing callback -int svIndex = 0; //!< SceneView index -//----------------------------------------------------------------------------- -/*! Java Native Interface (JNI) function declarations. These functions are -called by the Java interface class GLES3Lib. The function name follows the pattern -Java_{package name}_{JNI class name}_{function name}(JNIEnv* env,jclass obj,*); -The functions mostly forward to the C-Interface functions of SLProject declared -in SLInterface.h. 
-*/ -extern "C" { -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onInit(JNIEnv* env, jclass obj, jint width, jint height, jint dpi, jstring filePath); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTerminate(JNIEnv* env, jclass obj); -JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onUpdateVideo(JNIEnv* env, jclass obj); -JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onUpdateParallelJob(JNIEnv* env, jclass obj); -JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onPaintAllViews(JNIEnv* env, jclass obj); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onResize(JNIEnv* env, jclass obj, jint width, jint height); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onMouseDown(JNIEnv* env, jclass obj, jint button, jint x, jint y); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onMouseUp(JNIEnv* env, jclass obj, jint button, jint x, jint y); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onMouseMove(JNIEnv* env, jclass obj, jint x, jint y); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTouch2Down(JNIEnv* env, jclass obj, jint x1, jint y1, jint x2, jint y2); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTouch2Move(JNIEnv* env, jclass obj, jint x1, jint y1, jint x2, jint y2); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTouch2Up(JNIEnv* env, jclass obj, jint x1, jint y1, jint x2, jint y2); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onDoubleClick(JNIEnv* env, jclass obj, jint button, jint x, jint y); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onClose(JNIEnv* env, jclass obj); -JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_usesRotation(JNIEnv* env, jclass obj); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onRotationQUAT(JNIEnv* env, jclass obj, jfloat quatX, jfloat quatY, jfloat quatZ, jfloat quatW); -JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_usesLocation(JNIEnv* env, jclass obj); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onLocationLatLonAlt(JNIEnv* 
env, jclass obj, jdouble latitudeDEG, jdouble longitudeDEG, jdouble altitudeM, jfloat accuracyM); -JNIEXPORT jint JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_getVideoType(JNIEnv* env, jclass obj); -JNIEXPORT jint JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_getVideoSizeIndex(JNIEnv* env, jclass obj); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_grabVideoFileFrame(JNIEnv* env, jclass obj); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_copyVideoImage(JNIEnv* env, jclass obj, jint imgWidth, jint imgHeight, jbyteArray srcBuffer); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onSetupExternalDir(JNIEnv* env, jclass obj, jstring externalDirPath); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_copyVideoYUVPlanes(JNIEnv* env, jclass obj, jint srcW, jint srcH, jbyteArray yBuf, jint ySize, jint yPixStride, jint yLineStride, jbyteArray uBuf, jint uSize, jint uPixStride, jint uLineStride, jbyteArray vBuf, jint vSize, jint vPixStride, jint vLineStride); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_setCameraSize(JNIEnv* env, jclass obj, jint sizeIndex, jint sizeIndexMax, jint width, jint height); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_setDeviceParameter(JNIEnv* env, jclass obj, jstring parameter, jstring value); -JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_initMediaPipeAssetManager(JNIEnv* env, jclass obj, jobject assetManager, jstring cacheDirPath); -}; - -//----------------------------------------------------------------------------- -// external functions application code not in SLProject -extern void appDemoLoadScene(SLAssetManager* am, SLScene* s, SLSceneView* sv, SLSceneID sceneID); -extern bool onUpdateVideo(); -//----------------------------------------------------------------------------- -//! 
Native ray tracing callback function that calls the Java class method GLES3Lib.RaytracingCallback -bool Java_renderRaytracingCallback() -{ - jclass klass = environment->FindClass("ch/bfh/cpvrlab/GLES3Lib"); - jmethodID method = environment->GetStaticMethodID(klass, "RaytracingCallback", "()Z"); - return environment->CallStaticBooleanMethod(klass, method); -} -//----------------------------------------------------------------------------- -//! Native OpenGL info string print functions used in onInit -static void printGLString(const char* name, GLenum s) -{ - const char* v = (const char*)glGetString(s); - SL_LOG("GL %s = %s\n", name, v); -} -//----------------------------------------------------------------------------- -std::string jstring2stdstring(JNIEnv* env, jstring jStr) -{ - if (!jStr) return ""; - jboolean isCopy; - const char* chars = env->GetStringUTFChars(jStr, &isCopy); - std::string stdString(chars); - env->ReleaseStringUTFChars(jStr, chars); - return stdString; -} -//----------------------------------------------------------------------------- -//! Alternative SceneView creation C-function passed by slCreateSceneView -SLSceneView* createAppDemoSceneView(SLScene* scene, int dpi, SLInputManager& inputManger) -{ - return new AppDemoSceneView(scene, dpi, inputManger); -} -//----------------------------------------------------------------------------- -//! 
Creates the scene and sceneview instance -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onInit(JNIEnv* env, jclass obj, jint width, jint height, jint dpi, jstring filePath) -{ - environment = env; - const char* nativeString = env->GetStringUTFChars(filePath, 0); - string devicePath(nativeString); - env->ReleaseStringUTFChars(filePath, nativeString); - - SLVstring* cmdLineArgs = new SLVstring(); - - SL_LOG("GUI : Android"); - - string device_path_msg = "Device path:" + devicePath; - SL_LOG(device_path_msg.c_str(), 0); - - AppDemo::calibFilePath = devicePath + "/data/config/"; //that's where calibrations are stored an loaded from - AppDemo::calibIniPath = devicePath + "/data/calibrations/"; - CVCapture::instance()->loadCalibrations(Utils::ComputerInfos::get(), // deviceInfo string - AppDemo::calibFilePath); // for calibrations made - - //////////////////////////////////////////////////// - slCreateAppAndScene(*cmdLineArgs, - devicePath + "/data/", - devicePath + "/data/shaders/", - devicePath + "/data/models/", - devicePath + "/data/images/textures/", - devicePath + "/data/images/fonts/", - devicePath + "/data/videos/", - devicePath + "/", - "AppDemoAndroid", - (void*)appDemoLoadScene); - - //////////////////////////////////////////////////////////////////// - svIndex = slCreateSceneView(AppDemo::assetManager, - AppDemo::scene, - (int)width, - (int)height, - (int)dpi, - SID_Revolver, - (void*)&Java_renderRaytracingCallback, - 0, - (void*)createAppDemoSceneView, - (void*)AppDemoGui::build, - (void*)AppDemoGui::loadConfig, - (void*)AppDemoGui::saveConfig); - //////////////////////////////////////////////////////////////////// - - delete cmdLineArgs; -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTerminate(JNIEnv* env, jclass obj) -{ - slTerminate(); -} -//----------------------------------------------------------------------------- -extern "C" 
JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onUpdateVideo(JNIEnv* env, jclass obj) -{ - return onUpdateVideo(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onUpdateParallelJob(JNIEnv* env, jclass obj) -{ - return slUpdateParallelJob(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onPaintAllViews(JNIEnv* env, jclass obj) -{ - return slPaintAllViews(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onResize(JNIEnv* env, jclass obj, jint width, jint height) -{ - slResize(svIndex, width, height); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onMouseDown(JNIEnv* env, jclass obj, jint button, jint x, jint y) -{ - slMouseDown(svIndex, (SLMouseButton)button, x, y, K_none); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onMouseUp(JNIEnv* env, jclass obj, jint button, jint x, jint y) -{ - slMouseUp(svIndex, (SLMouseButton)button, x, y, K_none); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onMouseMove(JNIEnv* env, jclass obj, jint x, jint y) -{ - slMouseMove(svIndex, x, y); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTouch2Down(JNIEnv* env, jclass obj, jint x1, jint y1, jint x2, jint y2) -{ - slTouch2Down(svIndex, x1, y1, x2, y2); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL 
Java_ch_bfh_cpvrlab_GLES3Lib_onTouch2Move(JNIEnv* env, jclass obj, jint x1, jint y1, jint x2, jint y2) -{ - slTouch2Move(svIndex, x1, y1, x2, y2); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onTouch2Up(JNIEnv* env, jclass obj, jint x1, jint y1, jint x2, jint y2) -{ - slTouch2Up(svIndex, x1, y1, x2, y2); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onDoubleClick(JNIEnv* env, jclass obj, jint button, jint x, jint y) -{ - slDoubleClick(svIndex, MB_left, x, y, K_none); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onRotationQUAT(JNIEnv* env, jclass obj, jfloat quatX, jfloat quatY, jfloat quatZ, jfloat quatW) -{ - slRotationQUAT(quatX, quatY, quatZ, quatW); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onClose(JNIEnv* env, jclass obj) -{ - SL_LOG("onClose"); - slTerminate(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_usesRotation(JNIEnv* env, jclass obj) -{ - return slUsesRotation(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT - jint JNICALL - Java_ch_bfh_cpvrlab_GLES3Lib_getVideoType(JNIEnv* env, jclass obj) -{ - return (int)CVCapture::instance()->videoType(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT - jint JNICALL - Java_ch_bfh_cpvrlab_GLES3Lib_getVideoSizeIndex(JNIEnv* env, jclass obj) -{ - return CVCapture::instance()->activeCamera->camSizeIndex(); -} -//----------------------------------------------------------------------------- -//! 
Grabs a frame from a video file using OpenCV -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_grabVideoFileFrame(JNIEnv* env, jclass obj) -{ - SLSceneView* sv = AppDemo::sceneViews[0]; - CVCapture* capture = CVCapture::instance(); - - // Get the current capture size of the videofile - CVSize videoSizeBefore = capture->captureSize; - - // If viewportWdivH is negative the viewport aspect will be adapted to the video - // aspect ratio. No cropping will be applied. - // Android doesn't know the video file frame size before grab - float viewportWdivH = sv->viewportWdivH(); - if (sv->viewportSameAsVideo()) - viewportWdivH = -1; - - capture->grabAndAdjustForSL(viewportWdivH); - - // If video aspect has changed we need to tell the new viewport to the sceneview - CVSize videoSizeAfter = capture->captureSize; - if (sv->viewportSameAsVideo() && videoSizeBefore != videoSizeAfter) - sv->setViewportFromRatio(SLVec2i(videoSizeAfter.width, videoSizeAfter.height), - sv->viewportAlign(), - sv->viewportSameAsVideo()); -} -//----------------------------------------------------------------------------- -//! 
Copies the video image data to the CVCapture class -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_copyVideoImage(JNIEnv* env, jclass obj, jint imgWidth, jint imgHeight, jbyteArray imgBuffer) -{ - SLuchar* srcLumaPtr = reinterpret_cast(env->GetByteArrayElements(imgBuffer, 0)); - - if (srcLumaPtr == nullptr) - SL_EXIT_MSG("copyVideoImage: No image data pointer passed!"); - - SLSceneView* sv = AppDemo::sceneViews[0]; - CVCapture* capture = CVCapture::instance(); - float videoImgWdivH = (float)imgWidth / (float)imgHeight; - - if (sv->viewportSameAsVideo()) - { - // If video aspect has changed we need to tell the new viewport to the sceneview - if (Utils::abs(videoImgWdivH - sv->viewportWdivH()) > 0.01f) - sv->setViewportFromRatio(SLVec2i(imgWidth, imgHeight), sv->viewportAlign(), true); - } - - capture->loadIntoLastFrame(sv->viewportWdivH(), imgWidth, imgHeight, PF_yuv_420_888, srcLumaPtr, true); -} -//----------------------------------------------------------------------------- -//! This function is not in use and was an attempt to copy the data faster. 
-extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_copyVideoYUVPlanes(JNIEnv* env, jclass obj, jint srcW, jint srcH, jbyteArray yBuf, jint ySize, jint yPixStride, jint yLineStride, jbyteArray uBuf, jint uSize, jint uPixStride, jint uLineStride, jbyteArray vBuf, jint vSize, jint vPixStride, jint vLineStride) -{ - // Cast jbyteArray to unsigned char pointer - SLuchar* y = reinterpret_cast(env->GetByteArrayElements(yBuf, 0)); - SLuchar* u = reinterpret_cast(env->GetByteArrayElements(uBuf, 0)); - SLuchar* v = reinterpret_cast(env->GetByteArrayElements(vBuf, 0)); - - if (y == nullptr) SL_EXIT_MSG("copyVideoYUVPlanes: No pointer for y-buffer passed!"); - if (u == nullptr) SL_EXIT_MSG("copyVideoYUVPlanes: No pointer for u-buffer passed!"); - if (v == nullptr) SL_EXIT_MSG("copyVideoYUVPlanes: No pointer for v-buffer passed!"); - - // If viewportWdivH is negative the viewport aspect will be adapted to the video - // aspect ratio. No cropping will be applied. - float viewportWdivH = AppDemo::sceneViews[0]->viewportWdivH(); - if (AppDemo::sceneViews[0]->viewportSameAsVideo()) - viewportWdivH = -1; - - CVCapture::instance()->copyYUVPlanes(viewportWdivH, srcW, srcH, y, ySize, yPixStride, yLineStride, u, uSize, uPixStride, uLineStride, v, vSize, vPixStride, vLineStride); -} -//----------------------------------------------------------------------------- -//! Copies the GPS information to the SLApplicaiton class -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onLocationLatLonAlt(JNIEnv* env, - jclass obj, - jdouble latitudeDEG, - jdouble longitudeDEG, - jdouble altitudeM, - jfloat accuracyM) -{ - slLocationLatLonAlt(latitudeDEG, longitudeDEG, altitudeM, accuracyM); -} -//----------------------------------------------------------------------------- -//! 
Asks the SLApplicaiton class if the GPS sensor data is requested -extern "C" JNIEXPORT bool JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_usesLocation(JNIEnv* env, jclass obj) -{ - return slUsesLocation(); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_onSetupExternalDir(JNIEnv* env, - jclass obj, - jstring externalDirPath) -{ - std::string externalDirPathNative = jstring2stdstring(env, externalDirPath); - slSetupExternalDir(externalDirPathNative); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_setCameraSize(JNIEnv* env, - jclass obj, - jint sizeIndex, - jint sizeIndexMax, - jint width, - jint height) -{ - CVCapture::instance()->setCameraSize(sizeIndex, sizeIndexMax, width, height); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_setDeviceParameter(JNIEnv* env, - jclass obj, - jstring parameter, - jstring value) -{ - std::string par = jstring2stdstring(env, parameter); - std::string val = jstring2stdstring(env, value); - slSetDeviceParameter(par.c_str(), val.c_str()); -} -//----------------------------------------------------------------------------- -extern "C" JNIEXPORT void JNICALL Java_ch_bfh_cpvrlab_GLES3Lib_initMediaPipeAssetManager(JNIEnv* env, - jclass obj, - jobject assetManager, - jstring cacheDirPath) -{ - mp_init_asset_manager(env, assetManager, cacheDirPath); -} \ No newline at end of file diff --git a/apps/app_demo_slproject/android/slproject-release-key b/apps/app_demo_slproject/android/slproject-release-key deleted file mode 100644 index 79046abc..00000000 Binary files a/apps/app_demo_slproject/android/slproject-release-key and /dev/null differ diff --git a/apps/app_demo_slproject/emscripten/AppDemoMainEmscripten.cpp 
b/apps/app_demo_slproject/emscripten/AppDemoMainEmscripten.cpp deleted file mode 100644 index 5cedbba2..00000000 --- a/apps/app_demo_slproject/emscripten/AppDemoMainEmscripten.cpp +++ /dev/null @@ -1,612 +0,0 @@ -// ############################################################################# -// File: AppDemoMainEmscripten.cpp -// Purpose: Application that demonstrates most features of the SLProject -// framework with WebGL, WebAssembly and Emscripten in a web -// browser. Implementation of the GUI is done with the emscripten -// framework. -// Date: October 2022 -// Codestyle: https://github.com/cpvrlab/SLProject/wiki/SLProject-Coding-Style -// Authors: Marino von Wattenwyl -// License: This software is provided under the GNU General Public License -// Please visit: http://opensource.org/licenses/GPL-3.0 -// ############################################################################# - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -static int canvasWidth; -static int canvasHeight; - -static int lastTouchDownX; -static int lastTouchDownY; -static double lastTouchDownTimeMS; - -static GLFWwindow* window; //!< The global glfw window handle -static SLint svIndex; //!< SceneView index -static SLint scrWidth; //!< Window width at start up -static SLint scrHeight; //!< Window height at start up -static SLbool fixAspectRatio; //!< Flag if wnd aspect ratio should be fixed -static SLfloat scrWdivH; //!< aspect ratio screen width divided by height -static SLint dpi = 142; //!< Dot per inch resolution of screen -static SLint startX; //!< start position x in pixels -static SLint startY; //!< start position y in pixels -static SLint mouseX; //!< Last mouse position x in pixels -static SLint mouseY; //!< Last mouse position y in pixels -static SLVec2i touch2; //!< Last finger touch 2 position in pixels -static SLVec2i touchDelta; //!< Delta between two fingers in x -static SLint lastWidth; //!< Last window 
width in pixels -static SLint lastHeight; //!< Last window height in pixels -static SLbool fullscreen = false; //!< flag if window is in fullscreen mode - -//----------------------------------------------------------------------------- -extern void appDemoLoadScene(SLAssetManager* am, - SLScene* s, - SLSceneView* sv, - SLSceneID sceneID); -extern bool onUpdateVideo(); -//----------------------------------------------------------------------------- -void updateCanvas() -{ - // clang-format off - EM_ASM({ - let canvas = Module['canvas']; - canvas.width = $0; - canvas.height = $1; - }, canvasWidth, canvasHeight); - // clang-format on -} -//----------------------------------------------------------------------------- -SLKey mapKeyToSLKey(unsigned long key) -{ - switch (key) - { - case 8: return K_backspace; - case 9: return K_tab; - case 13: return K_enter; - case 16: return K_shift; - case 17: return K_ctrl; - case 18: return K_alt; - case 27: return K_esc; - case 32: return K_space; - case 33: return K_pageUp; - case 34: return K_pageDown; - case 35: return K_end; - case 36: return K_home; - case 37: return K_left; - case 38: return K_up; - case 39: return K_right; - case 40: return K_down; - case 45: return K_insert; - case 46: return K_delete; - case 96: return K_NP0; - case 97: return K_NP1; - case 98: return K_NP2; - case 99: return K_NP3; - case 100: return K_NP4; - case 101: return K_NP5; - case 102: return K_NP6; - case 103: return K_NP7; - case 104: return K_NP8; - case 105: return K_NP9; - case 106: return K_NPMultiply; - case 107: return K_NPAdd; - case 109: return K_NPSubtract; - case 110: return K_NPDecimal; - case 111: return K_NPDivide; - case 112: return K_F1; - case 113: return K_F2; - case 114: return K_F3; - case 115: return K_F4; - case 116: return K_F5; - case 117: return K_F6; - case 118: return K_F7; - case 119: return K_F8; - case 120: return K_F9; - case 121: return K_F10; - case 122: return K_F11; - case 123: return K_F12; - default: return 
(SLKey)key; - } -} -//----------------------------------------------------------------------------- -SLKey mapModifiersToSLModifiers(bool shiftDown, bool ctrlDown, bool altDown) -{ - int modifiers = 0; - if (shiftDown) modifiers |= K_shift; - if (ctrlDown) modifiers |= K_ctrl; - if (altDown) modifiers |= K_alt; - return (SLKey)modifiers; -} -//----------------------------------------------------------------------------- -SLKey mapModifiersToSLModifiers(const EmscriptenMouseEvent* mouseEvent) -{ - return mapModifiersToSLModifiers(mouseEvent->shiftKey, - mouseEvent->ctrlKey, - mouseEvent->altKey); -} -//----------------------------------------------------------------------------- -SLKey mapModifiersToSLModifiers(const EmscriptenKeyboardEvent* keyEvent) -{ - return mapModifiersToSLModifiers(keyEvent->shiftKey, - keyEvent->ctrlKey, - keyEvent->altKey); -} -//----------------------------------------------------------------------------- -EMSCRIPTEN_RESULT emOnMousePressed(int eventType, - const EmscriptenMouseEvent* mouseEvent, - void* userData) -{ - SLint x = mouseX; - SLint y = mouseY; - SLKey modifiers = mapModifiersToSLModifiers(mouseEvent); - - startX = x; - startY = y; - - switch (mouseEvent->button) - { - case 0: - if (modifiers & K_alt && modifiers & K_ctrl) - slTouch2Down(svIndex, x - 20, y, x + 20, y); - else - slMouseDown(svIndex, - MB_left, - x, - y, - modifiers); - break; - case 1: - slMouseDown(svIndex, - MB_middle, - x, - y, - modifiers); - break; - case 2: - slMouseDown(svIndex, - MB_right, - x, - y, - modifiers); - break; - default: break; - } - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnMouseReleased(int eventType, - const EmscriptenMouseEvent* mouseEvent, - void* userData) -{ - SLint x = mouseX; - SLint y = mouseY; - SLKey modifiers = mapModifiersToSLModifiers(mouseEvent); - - startX = -1; - startY = -1; - - switch (mouseEvent->button) - { - case 0: - slMouseUp(svIndex, - MB_left, 
- x, - y, - modifiers); - break; - case 1: - slMouseUp(svIndex, - MB_middle, - x, - y, - modifiers); - break; - case 2: - slMouseUp(svIndex, - MB_right, - x, - y, - modifiers); - break; - default: break; - } - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnMouseDoubleClicked(int eventType, - const EmscriptenMouseEvent* mouseEvent, - void* userData) -{ - SLint x = mouseX; - SLint y = mouseY; - SLKey modifiers = mapModifiersToSLModifiers(mouseEvent); - - switch (mouseEvent->button) - { - case GLFW_MOUSE_BUTTON_LEFT: - slDoubleClick(svIndex, - MB_left, - x, - y, - modifiers); - break; - case GLFW_MOUSE_BUTTON_RIGHT: - slDoubleClick(svIndex, - MB_right, - x, - y, - modifiers); - break; - case GLFW_MOUSE_BUTTON_MIDDLE: - slDoubleClick(svIndex, - MB_middle, - x, - y, - modifiers); - break; - default: break; - } - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnMouseMove(int eventType, - const EmscriptenMouseEvent* mouseEvent, - void* userData) -{ - mouseX = (int)mouseEvent->targetX; - mouseY = (int)mouseEvent->targetY; - - if (mouseEvent->altKey && mouseEvent->ctrlKey) - slTouch2Move(svIndex, - mouseX - 20, - mouseY, - mouseX + 20, - mouseY); - else - slMouseMove(svIndex, - mouseX, - mouseY); - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnMouseWheel(int eventType, - const EmscriptenWheelEvent* wheelEvent, - void* userData) -{ - // Invert the sign because the scroll value is inverted - double deltaY = -wheelEvent->deltaY; - - // Make sure the delta is at least one integer - if (std::abs(deltaY) < 1) deltaY = Utils::sign(wheelEvent->deltaY); - - SLKey modifiers = mapModifiersToSLModifiers(&wheelEvent->mouse); - slMouseWheel(svIndex, (int)deltaY, modifiers); - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL 
emOnKeyPressed(int eventType, - const EmscriptenKeyboardEvent* keyEvent, - void* userData) -{ - if (keyEvent->repeat) - return EM_TRUE; - - SLKey key = mapKeyToSLKey(keyEvent->keyCode); - SLKey modifiers = mapModifiersToSLModifiers(keyEvent); - - if (modifiers & K_alt && modifiers & K_shift) - { - SLSceneView* sv = AppDemo::sceneViews[0]; - - if (key == '0' && sv) - { - appDemoLoadScene(AppDemo::assetManager, - AppDemo::scene, - sv, - SID_Empty); - SL_LOG("Loading SceneID: %d", AppDemo::sceneID); - } - else if (key == K_left && sv && AppDemo::sceneID > 0) - { - appDemoLoadScene(AppDemo::assetManager, - AppDemo::scene, - sv, - (SLSceneID)(AppDemo::sceneID - 1)); - SL_LOG("Loading SceneID: %d", AppDemo::sceneID); - } - else if (key == K_right && sv && AppDemo::sceneID < SID_MaxNoBenchmarks - 1) - { - appDemoLoadScene(AppDemo::assetManager, - AppDemo::scene, - sv, - (SLSceneID)(AppDemo::sceneID + 1)); - SL_LOG("Loading SceneID: %d", AppDemo::sceneID); - } - } - - slKeyPress(svIndex, key, modifiers); - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnKeyReleased(int eventType, - const EmscriptenKeyboardEvent* keyEvent, - void* userData) -{ - SLKey key = mapKeyToSLKey(keyEvent->keyCode); - SLKey modifiers = mapModifiersToSLModifiers(keyEvent); - slKeyRelease(svIndex, key, modifiers); - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnTouchStart(int eventType, - const EmscriptenTouchEvent* touchEvent, - void* userData) -{ - if (touchEvent->numTouches == 1) - { - mouseX = (int)touchEvent->touches[0].clientX; - mouseY = (int)touchEvent->touches[0].clientY; - slMouseDown(svIndex, - MB_left, - mouseX, - mouseY, - K_none); - lastTouchDownTimeMS = touchEvent->timestamp; - } - else if (touchEvent->numTouches == 2) - { - int x0 = (int)touchEvent->touches[0].clientX; - int y0 = (int)touchEvent->touches[0].clientY; - int x1 = 
(int)touchEvent->touches[1].clientX; - int y1 = (int)touchEvent->touches[1].clientY; - slTouch2Down(svIndex, x0, y0, x1, y1); - } - - lastTouchDownX = mouseX; - lastTouchDownY = mouseY; - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnTouchEnd(int eventType, - const EmscriptenTouchEvent* touchEvent, - void* userData) -{ - if (touchEvent->numTouches == 1) - { - mouseX = (int)touchEvent->touches[0].clientX; - mouseY = (int)touchEvent->touches[0].clientY; - slMouseUp(svIndex, - MB_left, - mouseX, - mouseY, - K_none); - - int dx = std::abs(mouseX - lastTouchDownX); - int dy = std::abs(mouseY - lastTouchDownY); - double dt = touchEvent->timestamp - lastTouchDownTimeMS; - - if (dt > 800 && dx < 15 && dy < 15) - { - slMouseDown(svIndex, - MB_right, - lastTouchDownX, - lastTouchDownY, - K_none); - slMouseUp(svIndex, - MB_right, - lastTouchDownX, - lastTouchDownY, - K_none); - } - } - else if (touchEvent->numTouches == 2) - { - int x0 = (int)touchEvent->touches[0].clientX; - int y0 = (int)touchEvent->touches[0].clientY; - int x1 = (int)touchEvent->touches[1].clientX; - int y1 = (int)touchEvent->touches[1].clientY; - slTouch2Up(svIndex, x0, y0, x1, y1); - } - - return EM_TRUE; -} -//----------------------------------------------------------------------------- -EM_BOOL emOnTouchMove(int eventType, - const EmscriptenTouchEvent* touchEvent, - void* userData) -{ - if (touchEvent->numTouches == 1) - { - mouseX = (int)touchEvent->touches[0].clientX; - mouseY = (int)touchEvent->touches[0].clientY; - slMouseMove(svIndex, mouseX, mouseY); - } - else if (touchEvent->numTouches == 2) - { - int x0 = (int)touchEvent->touches[0].clientX; - int y0 = (int)touchEvent->touches[0].clientY; - int x1 = (int)touchEvent->touches[1].clientX; - int y1 = (int)touchEvent->touches[1].clientY; - slTouch2Move(svIndex, x0, y0, x1, y1); - } - - return EM_TRUE; -} 
-//----------------------------------------------------------------------------- -const char* emOnUnload(int eventType, - const void* reserved, - void* userData) -{ - slTerminate(); - return nullptr; -} -//----------------------------------------------------------------------------- -SLSceneView* createAppDemoSceneView(SLScene* scene, - int curDPI, - SLInputManager& inputManager) -{ - return (SLSceneView*)new AppDemoSceneView(scene, - curDPI, - inputManager); -} -//----------------------------------------------------------------------------- -bool onPaint() -{ - if (AppDemo::sceneViews.empty()) - return false; - SLSceneView* sv = AppDemo::sceneViews[svIndex]; - - int newCanvasWidth = MAIN_THREAD_EM_ASM_INT(return window.innerWidth;); - int newCanvasHeight = MAIN_THREAD_EM_ASM_INT(return window.innerHeight;); - - if (newCanvasWidth != canvasWidth || newCanvasHeight != canvasHeight) - { - canvasWidth = newCanvasWidth; - canvasHeight = newCanvasHeight; - updateCanvas(); - - if (!AppDemo::sceneViews.empty()) - slResize(svIndex, - canvasWidth, - canvasHeight); - } - - // If live video image is requested grab it and copy it - if (CVCapture::instance()->videoType() != VT_NONE) - { - float viewportWdivH = sv->viewportWdivH(); - CVCapture::instance()->grabAndAdjustForSL(viewportWdivH); - } - - /////////////////////////////////////////////// - onUpdateVideo(); - bool jobIsRunning = slUpdateParallelJob(); - bool viewsNeedsRepaint = slPaintAllViews(); - /////////////////////////////////////////////// - - return jobIsRunning || viewsNeedsRepaint; -} -//----------------------------------------------------------------------------- -void onLoop() -{ - onPaint(); -} -//----------------------------------------------------------------------------- -int main(void) -{ - canvasWidth = MAIN_THREAD_EM_ASM_INT(return window.innerWidth); - canvasHeight = MAIN_THREAD_EM_ASM_INT(return window.innerHeight); - updateCanvas(); - - EmscriptenWebGLContextAttributes attributes; - 
emscripten_webgl_init_context_attributes(&attributes); - attributes.enableExtensionsByDefault = true; - attributes.antialias = false; - attributes.depth = true; - attributes.stencil = true; - attributes.alpha = true; - attributes.majorVersion = 2; - attributes.minorVersion = 0; - attributes.preserveDrawingBuffer = true; - - auto context = emscripten_webgl_create_context("#canvas", &attributes); - if (context > 0) - SL_LOG("WebGL context created."); - else - SL_EXIT_MSG("Failed to create WebGL context."); - - EMSCRIPTEN_RESULT result = emscripten_webgl_make_context_current(context); - if (result == EMSCRIPTEN_RESULT_SUCCESS) - SL_LOG("WebGL context made current."); - else - SL_EXIT_MSG("Failed to make WebGL context current."); - - emscripten_set_mousedown_callback("#canvas", nullptr, false, emOnMousePressed); - emscripten_set_mouseup_callback("#canvas", nullptr, false, emOnMouseReleased); - emscripten_set_dblclick_callback("#canvas", nullptr, false, emOnMouseDoubleClicked); - emscripten_set_mousemove_callback("#canvas", nullptr, false, emOnMouseMove); - emscripten_set_wheel_callback("#canvas", nullptr, false, emOnMouseWheel); - emscripten_set_keydown_callback(EMSCRIPTEN_EVENT_TARGET_DOCUMENT, nullptr, false, emOnKeyPressed); - emscripten_set_keyup_callback(EMSCRIPTEN_EVENT_TARGET_DOCUMENT, nullptr, false, emOnKeyReleased); - emscripten_set_touchstart_callback("#canvas", nullptr, false, emOnTouchStart); - emscripten_set_touchend_callback("#canvas", nullptr, false, emOnTouchEnd); - emscripten_set_touchmove_callback("#canvas", nullptr, false, emOnTouchMove); - emscripten_set_beforeunload_callback(nullptr, emOnUnload); - - // HACK: Fixes to make this able to run in an +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open the example in a new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleImageBasedLighting.md b/docs/pages/ExampleImageBasedLighting.md index 4d879e5a..c58b6d90 100644 --- a/docs/pages/ExampleImageBasedLighting.md +++ b/docs/pages/ExampleImageBasedLighting.md @@ -1,5 +1,4 @@ \page example-ibl Image Based Lighting - The following scene shows 7 x 7 spheres with different metallic (vertical) and roughness (horizontal) material parameters. These parameters can be passed either by float values or as textures per pixel. @@ -9,7 +8,9 @@ The ambient part is added by the HDRI environment map shown around the scene as The shaders for this type of SLMaterial are generated automatically in SLGLProgramGenerated. \htmlonly - +Unfortunately, we cannot embed externally hosted WebAssembly code on github.io.
+Please open the example in a new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleLevelOfDetail.md b/docs/pages/ExampleLevelOfDetail.md index 16ca5f2b..82629ae0 100644 --- a/docs/pages/ExampleLevelOfDetail.md +++ b/docs/pages/ExampleLevelOfDetail.md @@ -1,5 +1,4 @@ \page example-lod Level of Detail - The following scene shows 2500 corinthian columns with each 83k triangles in their highest resolution. With 3 levels of detail the amount of geometry is reduced depending on the size of the bounding rectangle in screen space. In addition, we do automatically a view frustum culling to optimize rendering performance. @@ -8,7 +7,10 @@ The scene uses also cascaded shadow mapping with the current time im Switzerland You can adjust the date and time with the dialogue *Info > Date-Time*. \htmlonly - + +Unfortunately, we cannot embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleNodeAnimations.md b/docs/pages/ExampleNodeAnimations.md index c4c27d6a..e0a477eb 100644 --- a/docs/pages/ExampleNodeAnimations.md +++ b/docs/pages/ExampleNodeAnimations.md @@ -5,7 +5,9 @@ SLAnimation instances are created and controlled by the SLAnimManager. Its interface is in the menu *Animation*. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleParticles.md b/docs/pages/ExampleParticles.md index df280a1c..ff39749d 100644 --- a/docs/pages/ExampleParticles.md +++ b/docs/pages/ExampleParticles.md @@ -8,7 +8,9 @@ for the flames. The center fire uses one PS for the glow, one for the flames, on and one for the white smoke. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExamplePathtracing.md b/docs/pages/ExamplePathtracing.md index 00ee2bfb..29095803 100644 --- a/docs/pages/ExamplePathtracing.md +++ b/docs/pages/ExamplePathtracing.md @@ -1,5 +1,4 @@ \page example-pathtracing Path Tracing - Alternatively you can render a scene with path tracing using the menu *Renderer > Path Tracing*. The path tracer is optimized for the Blinn-Phong reflection model extended with reflection and refraction coefficient. By default, the resolution 0.5 times the window resolution. You can adjust the path tracing setting over the menu *PT*. @@ -8,7 +7,9 @@ The path tracer is multithreaded and uses a combination bounding boxes and regul The path tracer is implemented in the classes SLPathtracer and SLRay. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleRaytracing.md b/docs/pages/ExampleRaytracing.md index b972e98a..97ebd2e1 100644 --- a/docs/pages/ExampleRaytracing.md +++ b/docs/pages/ExampleRaytracing.md @@ -7,7 +7,9 @@ The ray tracer is multithreaded and uses a combination bounding boxes and regula The ray tracer is implemented in the classes SLRaytracer and SLRay. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleRevolver.md b/docs/pages/ExampleRevolver.md index ed3c1b4f..aa1c23d8 100644 --- a/docs/pages/ExampleRevolver.md +++ b/docs/pages/ExampleRevolver.md @@ -1,10 +1,11 @@ \page example-revolver Revolving Meshes - Examples of revolving mesh objects constructed by rotating a 2D curve. The classes SLArrow, SLCone, SLCylinder, SLDisk, SLLens and SLSpheric inherit the SLRevolver class that inherits from SLMesh. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleShadowMapping.md b/docs/pages/ExampleShadowMapping.md index 3a87525c..40a3746d 100644 --- a/docs/pages/ExampleShadowMapping.md +++ b/docs/pages/ExampleShadowMapping.md @@ -1,4 +1,5 @@ \page example-shadow-mapping Shadow Mapping + In this scene you can see all light types: Directional, rectangular, spot and point light. All light types (SLLightDirect, SLLightRect, SLLightSpot) are derived from SLLight and can have a SLShadowMap. Only directional light (SLLightDirect) can create cascaded shadow maps. @@ -6,7 +7,9 @@ Double-click a light to see its shadow volume. *With RMB > Show Properties* you can see more properties under *Node Properties > Shadow mapping*. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleSkinnedAnimations.md b/docs/pages/ExampleSkinnedAnimations.md index c5271e5b..064180a1 100644 --- a/docs/pages/ExampleSkinnedAnimations.md +++ b/docs/pages/ExampleSkinnedAnimations.md @@ -1,9 +1,12 @@ \page example-skinned-animations Skinned Animations + Skinned skeleton animation are imported at the moment with the Collada file format (DAE). Skinned skeleton animation are controlled by the SLAnimManager. Its interface is in the menu *Animation*. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/ExampleVolumeRendering.md b/docs/pages/ExampleVolumeRendering.md index 8d5bbbc1..3b29f937 100644 --- a/docs/pages/ExampleVolumeRendering.md +++ b/docs/pages/ExampleVolumeRendering.md @@ -1,10 +1,13 @@ \page example-volume-rendering Volume Rendering + Volume Rendering of an MRI angiography dataset. In the material properties you can adjust the transfer function for color and transparency points that transform the MRI intensities into color and transparency. Volume rendering is realized with a special fragment shader that performs a ray cast through the MRI volume. \htmlonly - +Unfortunately, we can not embed externally hosted WebAssembly code on github.io.
+Please open example in new tab + \endhtmlonly General help: diff --git a/docs/pages/Introduction.md b/docs/pages/Introduction.md index 21493c63..1b3a7c61 100644 --- a/docs/pages/Introduction.md +++ b/docs/pages/Introduction.md @@ -1,141 +1,107 @@ \page introduction Introduction - +\htmlonly +\endhtmlonly \tableofcontents + \section overview Overview -

-There are 5 code sections in an SLProject application: -

    -
  • APP: Application code:
  • -
      -
    • OS dependent OpenGL context and window creation
    • -
    • UI Definition using ImGUI
    • -
    • Scene definition using SL
    • -
    • Video processing using CV
    • -
    -
  • SL: Scene Library code:
  • -
      -
    • Scene & Sceneview management
    • -
    • Scenegraph classes
    • -
    • 3D and 2D rendering
    • -
    • Animation
    • -
    -
  • CV: Computer Vision code:
  • -
      -
    • Video Capturing
    • -
    • Calibration
    • -
    • Tracking
    • -
    -
  • Utils: Utilities used by SL and CV
  • -
  • Externals: External dependencies
  • -
- -

-

+ +There are 5 code sections in SLProject: + +- **APP**: Application code + - OS-dependent OpenGL context and window creation + - UI definition using ImGUI + - Scene definition using SL + - Video processing using CV +- **SL**: Scene Library code + - Scene & scene view management + - Scenegraph classes + - 3D and 2D rendering + - Animation +- **CV**: Computer vision code: + - Video capturing + - Calibration + - Tracking +- **Utils**: Utilities used by SL and CV +- **Externals**: External dependencies + + + \section diagram Class Diagram -

+ The following class diagram gives you an overview of the major classes with its important attributes and methods: -

    -
  • The gray boxes on top contain the application code that depend on the OS and do the GUI, the scene assembly and the video processing.
  • -
  • The light blue classes are the central classes with the top-level instances of SLAssetManager and SLInputManager. The core classes for the scene are SLScene and SLSceneView.
  • -
  • The dark blue classes are the alternative renderers for ray tracing and path tracing.
  • -
  • The yellow classes define the materials that are responsible for the visual appearances of the mesh objects.
  • -
  • The green classes build the scene graph that defines the spacial structure of the visible objects.
  • -
  • The pink classes define a single triangulated mesh object.
  • -
  • The violet classes encapsulate all OpenGL vertex array object and buffer objects.
  • -
  • The red classes build the animation framework.
  • -
  • The orange classes encapsulate the video, image and AR tracking functionality using OpenCV. CV classes are independent from all SL classes. Only SLGLTexture uses CVImage for its texture images.
  • -
  • The red classes build the animation framework.
  • -
  • The white classes are low level classes for the math. Some of them are within the namespace Utils.
  • -
  • The gray boxes at the bottom are the external libraries that are used within the frame work.
  • -
- -

-

+ +- The **gray boxes** on top contain the application code that depends on the OS and does the GUI, the scene assembly and the video processing. +- The **light blue classes** are the central classes with the top-level instances of SLAssetManager and SLInputManager. The core classes for the scene are SLScene and SLSceneView. +- The **dark blue classes** are the alternative renderers for ray tracing and path tracing. +- The **yellow classes** define the materials that are responsible for the visual appearances of the mesh objects. +- The **green classes** build the scene graph that defines the spatial structure of the visible objects. +- The **pink classes** define a single triangulated mesh object. +- The **violet classes** encapsulate all OpenGL vertex array object and buffer objects. +- The **red classes** build the animation framework. +- The **orange classes** encapsulate the video, image and AR tracking functionality using OpenCV. CV classes are independent from all SL classes. Only SLGLTexture uses CVImage for its texture images. +- The **red classes** build the animation framework. +- The **white classes** are low level classes for the math. Some of them are within the namespace Utils. +- The **gray boxes** at the bottom are the external libraries that are used within the framework. + + + \section app Application Code -

-The applications code (grey boxes at the top of the diagram) contains the code for the operating system, the scene definition with SLProject library (SL), the video processing using CV-classes and the UI with ImGUI. In all cases we have the most outer shell of the application that handles the window and the OpenGL context creation and passes the events to a thin C-function interface before it is handled by the C++-framework in the library lib-SLProject. The following OS' are supported and applications are provided for demonstration: -

    -
  • - Windows, Linux and Max OSX applications use the GLFW - C-library for the platform independent window and context creation. - GLFW is included in the SLProject repository. See the app-Demo-GLFW for demonstration. - For all demo apps (desktop and mobile) we use the - ImGUI library for the UI. - The UI for the demo apps is implemented in AppDemoGUI. ImGUI is also included in the repository. -
  • -
  • - The Android application starts in JAVA and passes the events with JNI (Java - Native Interface) to the C-interface. See the - wiki for build instructions and the app-Demo-Android for demonstration. -
  • -
  • - On Apple iOS Devices the application starts in ObjectiveC before it passes the - events to the C-interface. See the - wiki for build instructions and the app-Demo-iOS for demonstration. -
  • -
  • - You could in fact use any GUI library on any OS that can create OpenGL contexts. - Other alternatives could be e.g. - Qt, - freeglut, - FLTK, - wxWidgets, - Nana or - Juce. -
  • -
  • - SLInterface.h and SLInterface.cpp define the C-Interface of the SLProject library. - We use a C-interface because this type can be called from any higher level language. - The SLInterface talks only to the SLInputManager, SLScene and SLSceneView classes. -
  • -
-

+ +The application's code (grey boxes at the top of the diagram) contains the code for the operating system, the scene definition with SLProject library (SL), the video processing using CV-classes and the UI with ImGUI. In all cases we have the most outer shell of the application that handles the window and the OpenGL context creation and passes the events to a thin C-function interface before it is handled by the C++-framework in the library lib-SLProject. +For more information about application code please read the page about the [App Framework](https://cpvrlab.github.io/SLProject4/app-framework.html). +The following systems are supported and applications are provided for demonstration: + +- **Windows, Linux and macOS** applications use the [GLFW](http://www.glfw.org/) + C-library for the platform independent window and context creation. + GLFW is included in the SLProject repository. See the app-Demo-GLFW for demonstration. +- **Android** applications start in Java and pass the events using the JNI (Java + Native Interface) to the C-interface. See the [wiki for build instructions](https://github.com/cpvrlab/SLProject4/wiki/Build-for-Android) + and `apps/source/platforms/android/example_project` for the project used in demos. +- **Apple iOS** applications start in Objective-C before it passes the events to the C-interface. + See the [wiki for build instructions](https://github.com/cpvrlab/SLProject4/wiki/Build-on-MacOS-with-XCode-for-iOS) + and `apps/source/platforms/ios/example_project` for the project used in demos. +- **Web** applications use [Emscripten](https://cpvrlab.github.io/SLProject4/emscripten.html) to compile C++ code to + [WebAssembly](https://webassembly.org/). This allows applications to be served by a web server + and run inside the browser. + +For all demo apps (desktop and mobile) we use the [Dear ImGui](https://github.com/ocornut/imgui) library for the UI. +The UI for `app-demo` is implemented in AppDemoGui. 
Dear ImGui is also included in the repository. +You could in fact use any GUI library on any OS that can render using OpenGL. +Other alternatives could be e.g. [Qt](https://www.qt.io/), [freeglut](http://freeglut.sourceforge.net/), +[FLTK](http://www.fltk.org/index.php), [wxWidgets](http://www.wxwidgets.org/), +[Nana](http://www.nanapro.org/en-us/), or [Juce](http://www.juce.com/). + +SLInterface.h and SLInterface.cpp define the C-Interface of the SLProject library. +We use a C-interface because this type can be called from any higher level language. +The SLInterface talks only to the SLInputManager, SLScene and SLSceneView classes. + +For more information about SLProject's app framework, see [this page](#app-framework). \section central Central Classes -

+ The light blue classes form the center of the SLProject framework: -

    -
  • - SLAssetManager holds the expensive resources such as meshes, materials, textures and shader programs in vectors. -
  • -
  • - SLInputManager collects all user events from the mouse and keyboard as well as from additional input devices such as a LeapMotion or Kinect sensor. -
  • -
  • - SLScene is the top-level class of the framework that represents the scene with - its properties. The scene content is created in SLScene::onLoad. - It also holds one or more pointers to SLSceneView instances. -
  • -
  • - SLSceneView represents a dynamic real time 3D view onto the scene. - A scene can be shown in multiple sceneviews as demonstrated in the app-Viewer-Qt application. - A sceneview receives all events (keyboard, mouse etc.) from the GUI via the SLInputManager. -
  • -
-

+ +- SLAssetManager holds the expensive resources such as meshes, materials, textures and shader programs in vectors. +- SLInputManager collects all user events from the mouse and keyboard as well as from additional input devices such as a LeapMotion or Kinect sensor. +- SLScene is the top-level class of the framework that represents the scene with + its properties. The scene content is created in SLScene::onLoad. + It also holds one or more pointers to SLSceneView instances. +- SLSceneView represents a dynamic real time 3D view onto the scene. + A scene can be shown in multiple sceneviews as demonstrated in the app-Viewer-Qt application. + A scene view receives all events (keyboard, mouse etc.) from the GUI via the SLInputManager. \section node Scenegraph Classes -

+ SLNode is the major building block for the the scenegraph structure (green classes) and can have 0-N children nodes and 0-N triangle meshes. A node can be transformed (translated, rotated and scaled) in 3D-space. -

    -
  • - SLLightDirect, SLLightSpot and SLLightRect are from SLNode derived and define lights - that can be placed and directed in space. -
  • -
  • - SLNodeLOD implements the level of detail functionality. -
  • -
  • - SLCamera that defines the view to the scene. The scene can have multiple cameras - but only one can be active for the scene view. -
  • -
-

+ +- SLLightDirect, SLLightSpot and SLLightRect are from SLNode derived and define lights that can be placed and directed in space. +- SLNodeLOD implements the level of detail functionality. +- SLCamera that defines the view to the scene. The scene can have multiple cameras but only one can be active for the scene view. \section mesh Mesh Classes -

+ SLMesh is the base class for triangulated or wire framed meshes (pink classes). A mesh is rendered with a material defined in SLMaterial. A mesh has all the vertex attributes such as position, normals, texture coordinates. @@ -144,140 +110,81 @@ A mesh has an instance of SLGLVertexArray that does all the OpenGL drawing. This vertex array object (VAO) stores all attributes in either a float or half float vertex attribute buffer (SLGLVertexBuffer) that is generated in the memory of the GPU. The attribute data on the client side is not deleted because it is used for ray tracing. -

    -
  • - SLRevolver, SLSphere, SLCylinder, SLCone, SLBox, SLPolygon and SLRectangle are all - inheritants from SLMesh and represent the according platonic shapes. -
  • -
  • - SLAABBox and SLUniformGrid implement the space partitioning. - Every SLNode has an axis aligned AABB that is used for fast frustum culling and ray shooting. -
  • -
-

+ +- SLRevolver, SLSphere, SLCylinder, SLCone, SLBox, SLPolygon and SLRectangle are all inheritants from SLMesh and represent the according platonic shapes. +- SLAABBox and SLUniformGrid implement the space partitioning. Every SLNode has an axis aligned AABB that is used for fast frustum culling and ray shooting. \section vao VertexArray Classes -

+ SLGLVertexArray and SLGLVertexBuffer encapsulate all OpenGL buffer stuff and provides the core drawing functionality with OpenGL. -

\section material Material Classes -

+ SLMaterial is the core of the yellow classes that define the appearance of a mesh. A material can have one or more texture images and is rendered with a specific shader program written in the OpenGL shading language (GLSL). -

    -
  • - SLGLTexture defines a texture image with filtering parameters. -
  • -
  • - SLGLProgram defines a shader program with at least one vertex and one fragment shader program. -
  • -
  • - SLGLProgramGenerated implements the automatic GLSL shader generation based on the material parameters, the lights and if shadow mapping is used. -
  • -
  • - SLGLShader defines a vertex or fragment shader where the source code is read from a file. -
  • -
  • - All OpenGL code is restricted to the classes beginning with SLGL. - (SLGLState, SLGLTexture, SLGLShader, SLGLProgram, SLGLVertexArray and SLGLVertexBuffer.) -
  • -
  • - The linear algebra math is implemented in the classes SLMat3, SLMat4, SLVec3, SLVec4 and SLQuat4. -
  • -
-

+ +- SLGLTexture defines a texture image with filtering parameters. +- SLGLProgram defines a shader program with at least one vertex and one fragment shader program. +- SLGLProgramGenerated implements the automatic GLSL shader generation based on the material parameters, the lights and if shadow mapping is used. +- SLGLShader defines a vertex or fragment shader where the source code is read from a file. +- All OpenGL code is restricted to the classes beginning with SLGL. (SLGLState, SLGLTexture, SLGLShader, SLGLProgram, SLGLVertexArray and SLGLVertexBuffer.) +- The linear algebra math is implemented in the classes SLMat3, SLMat4, SLVec3, SLVec4 and SLQuat4. \section animation Animation Classes -

+ The red animation classes provide the functionality for simple node animations or skeletal animations. -

    -
  • - SLAnimManager: A single instance of this class is held by the SLScene instance and is - responsible for updating the enabled animations and to manage their life time. - It keeps a list of all skeletons and node animations and also holds a list of - all animation playback controllers. The update of all animations is done before - the rendering of all SLSceneView instances. -
  • -
  • - SLAnimPlayback manages the playback state and the local time of an SLAnimation. - It manages the way the time advances and how the animation loops. It has all - functionality to play, pause, stop, enable, speed up and slow down a playback. - A list of all SLAnimPlayback instances is held by the SLAnimManager. -
  • -
  • - SLAnimation is a container for multiple SLAnimTrack that build an animation. - E.g. a walk animation would consist of all the SLAnimTrack that make a - SLSkeleton walk. It also knows the length of the animation. -
  • -
  • - SLAnimTrack and SLNodeAnimTrack: An animation track is a track that affects a - single SLNode or an SLJoint of an SLSkeleton by interpolating its transform. - It holds therefore a list of SLKeyframe. For a smooth motion it can interpolate - the transform at a given time between two neighbouring SLKeyframe. -
  • -
  • - SLKeyframe and SLTransformKeyframe define a transform at a certain time on an - SLAnimTrack. -
  • -
  • - SLSkeleton: A skeleton is used to animate a hierarchical object like a human figure. - An SLSkeleton keeps track of its bones (SLJoints) in a tree structure and - points with _root to the root node of the skeleton hierarchy. - An SLSkeleton is not actively transforming any SLNode in the scene graph. - It just keeps track of its transformed SLJoint. - A mesh that is associated with a skeleton transforms all its vertices every - frame by the joint weights (Jw). Every vertex of a mesh has weights for up to four joints - by which it can be influenced. -
  • -
  • - SLJoint is a specialised SLNode that represents a single joint (or bone) in a skeleton - The main addition of SLJoint to the base SLNode is the offset matrix which is the - inverse transformation of the joint's binding pose in mesh space. - It is used to transform the vertices of a rigged SLMesh to the origin of the joint - to be able to manipulate them in the joint's local space. -
  • -
-

+ +- SLAnimManager: A single instance of this class is held by the SLScene instance and is + responsible for updating the enabled animations and to manage their life time. + It keeps a list of all skeletons and node animations and also holds a list of + all animation playback controllers. The update of all animations is done before + the rendering of all SLSceneView instances. +- SLAnimPlayback manages the playback state and the local time of an SLAnimation. + It manages the way the time advances and how the animation loops. It has all + functionality to play, pause, stop, enable, speed up and slow down a playback. + A list of all SLAnimPlayback instances is held by the SLAnimManager. +- SLAnimation is a container for multiple SLAnimTrack that build an animation. + E.g. a walk animation would consist of all the SLAnimTrack that make a + SLSkeleton walk. It also knows the length of the animation. +- SLAnimTrack and SLNodeAnimTrack: An animation track is a track that affects a + single SLNode or an SLJoint of an SLSkeleton by interpolating its transform. + It holds therefore a list of SLKeyframe. For a smooth motion it can interpolate + the transform at a given time between two neighbouring SLKeyframe. +- SLKeyframe and SLTransformKeyframe define a transform at a certain time on an + SLAnimTrack. +- SLSkeleton: A skeleton is used to animate a hierarchical object like a human figure. + An SLSkeleton keeps track of its bones (SLJoints) in a tree structure and + points with _root to the root node of the skeleton hierarchy. + An SLSkeleton is not actively transforming any SLNode in the scene graph. + It just keeps track of its transformed SLJoint. + A mesh that is associated with a skeleton transforms all its vertices every + frame by the joint weights (Jw). Every vertex of a mesh has weights for up to four joints + by which it can be influenced. 
+- SLJoint is a specialised SLNode that represents a single joint (or bone) in a skeleton + The main addition of SLJoint to the base SLNode is the offset matrix which is the + inverse transformation of the joint's binding pose in mesh space. + It is used to transform the vertices of a rigged SLMesh to the origin of the joint + to be able to manipulate them in the joint's local space. \section imageprocessing Image and Video Processing Classes -

+ The orange classes provide the functionality for video and image processing using the OpenCV framework. The SLProject framework can now process the images from attached live video cameras. This works via OpenCV on desktop OS as well as on iOS and Android. The live video image is constantly fed into an OpenGL texture that can be used as a texture on an objects material or as the scenes background. With the live video in the background you can create augmented reality (AR) applications. Examples can be found in the demo application under Load Scene > Using Video > Track Chessboard or Track AruCo. -

    -
  • - CVImage: Replaces the deprecated SLImage class and provides all for loading, saving and converting images. Internally it stores the image in a cv::Mat instance. -
  • -
  • - CVCapture: Holds static images from the OpenCV video capture or from an external (iOS and Android) video capture service. There is an CVCapture::lastFrame and an CVCapture::lastFrameGray with the gray level version of the last capture video frame. -
  • -
  • - CVCalibration holds all functionality to calibrate the video camera. A classic chessboard pattern is used for calibration. In the demo application a special scene is provided for the calibration (Preferences > Video > Calibrate Camera). -
  • -
  • - CVTracked is the base class for tracking classes. The scene can have multiple trackers. - A tracker is associated with a Node. When the object to be tracked is found, it controls the nodes transform. If the associated node is the scenes active camera a classic augmented reality application can be generated. -
  • -
  • - CVTrackedChessboard tracks the same chessboard that is used for the camera calibration. -
  • -
  • - CVTrackedAruco tracks special markers called AruCo markers. These markers are optimal in tracking performance and stability. -
  • -
  • - CVTrackedFeatures tracks any 2D features. Supported are any feature detectors and descriptors that - are provided by OpenCV. In addition we include the enhanced ORB feature detector developed by - Raul Mur -
  • -
-

- -

-Authors: marcus.hudritsch@bfh.ch
-Date: January 2022
-Copyright (c): 2002-2022 Marcus Hudritsch, Kirchrain 18, 2572 Sutz, Switzerland -

+ +- CVImage: Replaces the deprecated SLImage class and provides everything for loading, saving and converting images. Internally it stores the image in a cv::Mat instance. +- CVCapture: Holds static images from the OpenCV video capture or from an external (iOS and Android) video capture service. There is a CVCapture::lastFrame and a CVCapture::lastFrameGray with the gray level version of the last captured video frame. +- CVCalibration holds all functionality to calibrate the video camera. A classic chessboard pattern is used for calibration. In the demo application a special scene is provided for the calibration (Preferences > Video > Calibrate Camera). +- CVTracked is the base class for tracking classes. The scene can have multiple trackers. +- A tracker is associated with a Node. When the object to be tracked is found, it controls the node's transform. If the associated node is the scene's active camera a classic augmented reality application can be generated. +- CVTrackedChessboard tracks the same chessboard that is used for the camera calibration. +- CVTrackedAruco tracks special markers called AruCo markers. These markers are optimal in tracking performance and stability. +- CVTrackedFeatures tracks any 2D features. Supported are any feature detectors and descriptors that are provided by OpenCV. + In addition we include the enhanced ORB feature detector developed by [Raul Mur](https://github.com/raulmur/ORB_SLAM2). + +Authors: marcus.hudritsch@bfh.ch \ +Date: July 2024 \ +Copyright (c): 2002-2024 Marcus Hudritsch, Kirchrain 18, 2572 Sutz, Switzerland diff --git a/docs/pages/SLProject.md b/docs/pages/SLProject.md index 7efa3e4a..d1774b0d 100644 --- a/docs/pages/SLProject.md +++ b/docs/pages/SLProject.md @@ -45,6 +45,7 @@ The code is provided without any warranties whether expressed or implied.

Read the \subpage introduction for an overview of the framework.
Read \subpage on-paint for an overview of how one frame gets rendered.
+Read \subpage app-framework for an overview of the app framework.
Read \subpage emscripten for an overview of how SLProject runs in the browser.

@@ -86,7 +87,7 @@ The framework uses beside OpenGL the following external libraries that are also

-Authors: marcus.hudritsch@bfh.ch
-Date: June 2023
+Author: marcus.hudritsch@bfh.ch
+Date: June 2024
Copyright (c): 2002-2023 Marcus Hudritsch, Kirchrain 18, 2572 Sutz, Switzerland

diff --git a/externals/eigen/CMakeLists.txt b/externals/eigen/CMakeLists.txt index f5840025..f3e69b84 100644 --- a/externals/eigen/CMakeLists.txt +++ b/externals/eigen/CMakeLists.txt @@ -1,6 +1,7 @@ -project(Eigen3) +# cmake_minimum_require must be the first command of the file +cmake_minimum_required(VERSION 3.5.0) -cmake_minimum_required(VERSION 2.8.5) +project(Eigen3) # guard against in-source builds @@ -8,6 +9,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ") endif() + # Alias Eigen_*_DIR to Eigen3_*_DIR: set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR}) @@ -19,16 +21,9 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() -string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower) -if( NOT cmake_build_type_tolower STREQUAL "debug" - AND NOT cmake_build_type_tolower STREQUAL "release" - AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo") - message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).") -endif() - ############################################################################# -# retrieve version infomation # +# retrieve version information # ############################################################################# # automatically parse the version number @@ -41,29 +36,61 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_ set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}") set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}) -# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty, -# but won't stop CMake. 
-execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT) -execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT) +# if we are in a git clone +if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git) + # if the git program is absent, this will leave the EIGEN_GIT_REVNUM string empty, + # but won't stop CMake. + execute_process(COMMAND git ls-remote --refs -q ${CMAKE_SOURCE_DIR} HEAD OUTPUT_VARIABLE EIGEN_GIT_OUTPUT) +endif() -# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output... -if(EIGEN_BRANCH_OUTPUT MATCHES "default") -string(REGEX MATCH "^changeset: *[0-9]*:([0-9;a-f]+).*" EIGEN_HG_CHANGESET_MATCH "${EIGEN_HGTIP_OUTPUT}") -set(EIGEN_HG_CHANGESET "${CMAKE_MATCH_1}") -endif(EIGEN_BRANCH_OUTPUT MATCHES "default") +# extract the git rev number from the git output... +if(EIGEN_GIT_OUTPUT) +string(REGEX MATCH "^([0-9;a-f]+).*" EIGEN_GIT_CHANGESET_MATCH "${EIGEN_GIT_OUTPUT}") +set(EIGEN_GIT_REVNUM "${CMAKE_MATCH_1}") +endif() #...and show it next to the version number -if(EIGEN_HG_CHANGESET) - set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (mercurial changeset ${EIGEN_HG_CHANGESET})") -else(EIGEN_HG_CHANGESET) +if(EIGEN_GIT_REVNUM) + set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER} (git rev ${EIGEN_GIT_REVNUM})") +else() set(EIGEN_VERSION "${EIGEN_VERSION_NUMBER}") -endif(EIGEN_HG_CHANGESET) - +endif() include(CheckCXXCompilerFlag) include(GNUInstallDirs) +include(CMakeDependentOption) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) + +option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)."
OFF) + + +macro(ei_add_cxx_compiler_flag FLAG) + string(REGEX REPLACE "-" "" SFLAG1 ${FLAG}) + string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1}) + check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG}) + if(COMPILER_SUPPORT_${SFLAG}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}") + endif() +endmacro() + +check_cxx_compiler_flag("-std=c++11" EIGEN_COMPILER_SUPPORT_CPP11) + +if(EIGEN_TEST_CXX11) + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_EXTENSIONS OFF) + if(EIGEN_COMPILER_SUPPORT_CPP11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() +else() + #set(CMAKE_CXX_STANDARD 03) + #set(CMAKE_CXX_EXTENSIONS OFF) + ei_add_cxx_compiler_flag("-std=c++03") +endif() + +# Determine if we should build shared libraries on this platform. +get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS) + ############################################################################# # find how to link to the standard libraries # ############################################################################# @@ -104,7 +131,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows) option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON) endif() -set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_INCLUDE_CURRENT_DIR OFF) option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON) @@ -115,15 +142,6 @@ endif() set(EIGEN_TEST_MAX_SIZE "320" CACHE STRING "Maximal matrix/vector size, default is 320") -macro(ei_add_cxx_compiler_flag FLAG) - string(REGEX REPLACE "-" "" SFLAG1 ${FLAG}) - string(REGEX REPLACE "\\+" "p" SFLAG ${SFLAG1}) - check_cxx_compiler_flag(${FLAG} COMPILER_SUPPORT_${SFLAG}) - if(COMPILER_SUPPORT_${SFLAG}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG}") - endif() -endmacro(ei_add_cxx_compiler_flag) - if(NOT MSVC) # We assume that other compilers are partly compatible with GNUCC @@ -137,7 +155,7 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wall") ei_add_cxx_compiler_flag("-Wextra") 
#ei_add_cxx_compiler_flag("-Weverything") # clang - + ei_add_cxx_compiler_flag("-Wundef") ei_add_cxx_compiler_flag("-Wcast-align") ei_add_cxx_compiler_flag("-Wchar-subscripts") @@ -152,29 +170,25 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wc++11-extensions") ei_add_cxx_compiler_flag("-Wdouble-promotion") # ei_add_cxx_compiler_flag("-Wconversion") - - # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 - # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0")) - if(NOT CMAKE_COMPILER_IS_GNUCXX) - ei_add_cxx_compiler_flag("-Wshadow") - endif() - + + ei_add_cxx_compiler_flag("-Wshadow") + ei_add_cxx_compiler_flag("-Wno-psabi") ei_add_cxx_compiler_flag("-Wno-variadic-macros") ei_add_cxx_compiler_flag("-Wno-long-long") - + ei_add_cxx_compiler_flag("-fno-check-new") ei_add_cxx_compiler_flag("-fno-common") ei_add_cxx_compiler_flag("-fstrict-aliasing") ei_add_cxx_compiler_flag("-wd981") # disable ICC's "operands are evaluated in unspecified order" remark ei_add_cxx_compiler_flag("-wd2304") # disable ICC's "warning #2304: non-explicit constructor with single argument may cause implicit type conversion" produced by -Wnon-virtual-dtor - - + + # The -ansi flag must be added last, otherwise it is also used as a linker flag by check_cxx_compiler_flag making it fails # Moreover we should not set both -strict-ansi and -ansi check_cxx_compiler_flag("-strict-ansi" COMPILER_SUPPORT_STRICTANSI) ei_add_cxx_compiler_flag("-Qunused-arguments") # disable clang warning: argument unused during compilation: '-ansi' - + if(COMPILER_SUPPORT_STRICTANSI) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -strict-ansi") else() @@ -185,7 +199,7 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-pie") ei_add_cxx_compiler_flag("-fPIE") endif() - + set(CMAKE_REQUIRED_FLAGS "") option(EIGEN_TEST_SSE2 "Enable/Disable SSE2 in tests/examples" OFF) @@ -230,12 +244,30 @@ if(NOT MSVC) message(STATUS "Enabling FMA in tests/examples") endif() + 
option(EIGEN_TEST_AVX2 "Enable/Disable AVX2 in tests/examples" OFF) + if(EIGEN_TEST_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma") + message(STATUS "Enabling AVX2 in tests/examples") + endif() + option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma") + if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") + endif() message(STATUS "Enabling AVX512 in tests/examples") endif() + option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) + if(EIGEN_TEST_AVX512DQ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq") + if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") + endif() + message(STATUS "Enabling AVX512DQ in tests/examples") + endif() + option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF) if(EIGEN_TEST_F16C) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") @@ -254,6 +286,12 @@ if(NOT MSVC) message(STATUS "Enabling VSX in tests/examples") endif() + option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF) + if(EIGEN_TEST_MSA) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa") + message(STATUS "Enabling MSA in tests/examples") + endif() + option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) if(EIGEN_TEST_FMA) @@ -271,12 +309,18 @@ if(NOT MSVC) message(STATUS "Enabling NEON in tests/examples") endif() - option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) - if(EIGEN_TEST_ZVECTOR) + option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z13) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector") message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") endif() + 
option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z14) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") + message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") + endif() + check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) if(COMPILER_SUPPORT_OPENMP) option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF) @@ -286,7 +330,7 @@ if(NOT MSVC) endif() endif() -else(NOT MSVC) +else() # C4127 - conditional expression is constant # C4714 - marked as __forceinline not inlined (I failed to deactivate it selectively) @@ -294,7 +338,7 @@ else(NOT MSVC) # because we are oftentimes returning objects that have a destructor or may # throw exceptions - in particular in the unit tests we are throwing extra many # exceptions to cover indexing errors. - # C4505 - unreferenced local function has been removed (impossible to deactive selectively) + # C4505 - unreferenced local function has been removed (impossible to deactivate selectively) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /wd4127 /wd4505 /wd4714") # replace all /Wx by /W4 @@ -314,10 +358,23 @@ else(NOT MSVC) if(NOT CMAKE_CL_64) # arch is not supported on 64 bit systems, SSE is enabled automatically. 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:SSE2") - endif(NOT CMAKE_CL_64) + endif() message(STATUS "Enabling SSE2 in tests/examples") - endif(EIGEN_TEST_SSE2) -endif(NOT MSVC) + endif() + + option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF) + if(EIGEN_TEST_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") + message(STATUS "Enabling AVX in tests/examples") + endif() + + option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF) + if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + message(STATUS "Enabling FMA/AVX2 in tests/examples") + endif() + +endif() option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF) option(EIGEN_TEST_X87 "Force using X87 instructions. Implies no vectorization." OFF) @@ -359,11 +416,9 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tensor module)." 
OFF) - set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") -include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR if(EIGEN_INCLUDE_INSTALL_DIR) @@ -372,22 +427,28 @@ endif() if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR) set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} - CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed") + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed") else() set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}/eigen3" - CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed" ) endif() set(CMAKEPACKAGE_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/eigen3/cmake" - CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed" ) set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig" - CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed" ) +foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR) + # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}". 
+ if(IS_ABSOLUTE "${${var}}") + file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}") + endif() +endforeach() # similar to set_target_properties but append the property instead of overwriting it macro(ei_add_target_property target prop value) @@ -396,9 +457,9 @@ macro(ei_add_target_property target prop value) # if the property wasn't previously set, ${previous} is now "previous-NOTFOUND" which cmake allows catching with plain if() if(NOT previous) set(previous "") - endif(NOT previous) + endif() set_target_properties(${target} PROPERTIES ${prop} "${previous} ${value}") -endmacro(ei_add_target_property) +endmacro() install(FILES signature_of_eigen3_matrix_library @@ -412,20 +473,26 @@ if(EIGEN_BUILD_PKGCONFIG) ) endif() -add_subdirectory(Eigen) +install(DIRECTORY Eigen DESTINATION ${INCLUDE_INSTALL_DIR} COMPONENT Devel) -add_subdirectory(doc EXCLUDE_FROM_ALL) -include(EigenConfigureTesting) +option(EIGEN_BUILD_DOC "Enable creation of Eigen documentation" ON) +if(EIGEN_BUILD_DOC) + add_subdirectory(doc EXCLUDE_FROM_ALL) +endif() -# fixme, not sure this line is still needed: -enable_testing() # must be called from the root CMakeLists, see man page +option(BUILD_TESTING "Enable creation of Eigen tests." ON) +if(BUILD_TESTING) + include(EigenConfigureTesting) -if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) - add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest -else() - add_subdirectory(test EXCLUDE_FROM_ALL) + if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) + add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest + else() + add_subdirectory(test EXCLUDE_FROM_ALL) + endif() + + add_subdirectory(failtest) endif() if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -438,9 +505,32 @@ endif() # add SYCL option(EIGEN_TEST_SYCL "Add Sycl support." OFF) +option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." 
OFF) if(EIGEN_TEST_SYCL) set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") - include(FindComputeCpp) + find_package(Threads REQUIRED) + if(EIGEN_SYCL_TRISYCL) + message(STATUS "Using triSYCL") + include(FindTriSYCL) + else() + message(STATUS "Using ComputeCPP SYCL") + include(FindComputeCpp) + set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF) + if (NOT MSVC) + set(COMPUTECPP_DRIVER_DEFAULT_VALUE ON) + endif() + option(COMPUTECPP_USE_COMPILER_DRIVER + "Use ComputeCpp driver instead of a 2 steps compilation" + ${COMPUTECPP_DRIVER_DEFAULT_VALUE} + ) + endif(EIGEN_SYCL_TRISYCL) + option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF) + if(EIGEN_DONT_VECTORIZE_SYCL) + message(STATUS "Disabling SYCL vectorization in tests/examples") + # When disabling SYCL vectorization, also disable Eigen default vectorization + add_definitions(-DEIGEN_DONT_VECTORIZE=1) + add_definitions(-DEIGEN_DONT_VECTORIZE_SYCL=1) + endif() endif() add_subdirectory(unsupported) @@ -453,51 +543,48 @@ add_subdirectory(scripts EXCLUDE_FROM_ALL) # TODO: consider also replacing EIGEN_BUILD_BTL by a custom target "make btl"? if(EIGEN_BUILD_BTL) add_subdirectory(bench/btl EXCLUDE_FROM_ALL) -endif(EIGEN_BUILD_BTL) +endif() if(NOT WIN32) add_subdirectory(bench/spbench EXCLUDE_FROM_ALL) -endif(NOT WIN32) +endif() configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY) -ei_testing_print_summary() +if(BUILD_TESTING) + ei_testing_print_summary() +endif() message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -option(EIGEN_FAILTEST "Enable failtests." 
OFF) -if(EIGEN_FAILTEST) - add_subdirectory(failtest) -endif() - string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) if(cmake_generator_tolower MATCHES "makefile") - message(STATUS "Some things you can do now:") - message(STATUS "--------------+--------------------------------------------------------------") - message(STATUS "Command | Description") - message(STATUS "--------------+--------------------------------------------------------------") - message(STATUS "make install | Install Eigen. Headers will be installed to:") - message(STATUS " | /") - message(STATUS " | Using the following values:") - message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") - message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") - message(STATUS " | Change the install location of Eigen headers using:") - message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") - message(STATUS " | Or:") - message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") - message(STATUS "make doc | Generate the API documentation, requires Doxygen & LaTeX") - message(STATUS "make check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") - message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)") - message(STATUS "make uninstall| Removes files installed by make install") - message(STATUS "--------------+--------------------------------------------------------------") + message(STATUS "Available targets (use: make TARGET):") else() - message(STATUS "To build/run the unit tests, read this page:") - message(STATUS " http://eigen.tuxfamily.org/index.php?title=Tests") + message(STATUS "Available targets (use: cmake --build . 
--target TARGET):") endif() - +message(STATUS "---------+--------------------------------------------------------------") +message(STATUS "Target | Description") +message(STATUS "---------+--------------------------------------------------------------") +message(STATUS "install | Install Eigen. Headers will be installed to:") +message(STATUS " | /") +message(STATUS " | Using the following values:") +message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") +message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") +message(STATUS " | Change the install location of Eigen headers using:") +message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") +message(STATUS " | Or:") +message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") +message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") +if(BUILD_TESTING) + message(STATUS "check | Build and run the unit-tests. Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") +endif() +message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") +message(STATUS "uninstall| Remove files installed by the install target") +message(STATUS "---------+--------------------------------------------------------------") message(STATUS "") @@ -509,82 +596,48 @@ set ( EIGEN_DEFINITIONS "") set ( EIGEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${INCLUDE_INSTALL_DIR}" ) set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} ) -# Interface libraries require at least CMake 3.0 -if (NOT CMAKE_VERSION VERSION_LESS 3.0) - include (CMakePackageConfigHelpers) - - # Imported target support - add_library (eigen INTERFACE) - - target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS}) - target_include_directories (eigen INTERFACE - $ - $ - ) - - # Export as title case Eigen - set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen) - - install (TARGETS eigen EXPORT Eigen3Targets) - - configure_package_config_file ( - 
${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake - PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR - INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} - NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components - ) - # Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does - # not depend on architecture specific settings or libraries. More - # specifically, an Eigen3Config.cmake generated from a 64 bit target can be - # used for 32 bit targets as well (and vice versa). - set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) - unset (CMAKE_SIZEOF_VOID_P) - write_basic_package_version_file (Eigen3ConfigVersion.cmake - VERSION ${EIGEN_VERSION_NUMBER} - COMPATIBILITY SameMajorVersion) - set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P}) - - # The Eigen target will be located in the Eigen3 namespace. Other CMake - # targets can refer to it using Eigen3::Eigen. - export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake) - # Export Eigen3 package to CMake registry such that it can be easily found by - # CMake even if it has not been installed to a standard directory. 
- export (PACKAGE Eigen3) - - install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}) - -else (NOT CMAKE_VERSION VERSION_LESS 3.0) - # Fallback to legacy Eigen3Config.cmake without the imported target - - # If CMakePackageConfigHelpers module is available (CMake >= 2.8.8) - # create a relocatable Config file, otherwise leave the hardcoded paths - include(CMakePackageConfigHelpers OPTIONAL RESULT_VARIABLE CPCH_PATH) - - if(CPCH_PATH) - configure_package_config_file ( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake - PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR - INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} - NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components - ) - else() - # The PACKAGE_* variables are defined by the configure_package_config_file - # but without it we define them manually to the hardcoded paths - set(PACKAGE_INIT "") - set(PACKAGE_EIGEN_INCLUDE_DIR ${EIGEN_INCLUDE_DIR}) - set(PACKAGE_EIGEN_ROOT_DIR ${EIGEN_ROOT_DIR}) - configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3ConfigLegacy.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake - @ONLY ESCAPE_QUOTES ) - endif() - - write_basic_package_version_file( Eigen3ConfigVersion.cmake - VERSION ${EIGEN_VERSION_NUMBER} - COMPATIBILITY SameMajorVersion ) - -endif (NOT CMAKE_VERSION VERSION_LESS 3.0) +include (CMakePackageConfigHelpers) + +# Imported target support +add_library (eigen INTERFACE) +add_library (Eigen3::Eigen ALIAS eigen) +target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS}) +target_include_directories (eigen INTERFACE + $ + $ +) + +# Export as title case Eigen +set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen) + +install (TARGETS eigen EXPORT Eigen3Targets) + +configure_package_config_file ( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake + PATH_VARS EIGEN_INCLUDE_DIR EIGEN_ROOT_DIR + 
INSTALL_DESTINATION ${CMAKEPACKAGE_INSTALL_DIR} + NO_CHECK_REQUIRED_COMPONENTS_MACRO # Eigen does not provide components +) +# Remove CMAKE_SIZEOF_VOID_P from Eigen3ConfigVersion.cmake since Eigen does +# not depend on architecture specific settings or libraries. More +# specifically, an Eigen3Config.cmake generated from a 64 bit target can be +# used for 32 bit targets as well (and vice versa). +set (_Eigen3_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) +unset (CMAKE_SIZEOF_VOID_P) +write_basic_package_version_file (Eigen3ConfigVersion.cmake + VERSION ${EIGEN_VERSION_NUMBER} + COMPATIBILITY SameMajorVersion) +set (CMAKE_SIZEOF_VOID_P ${_Eigen3_CMAKE_SIZEOF_VOID_P}) + +# The Eigen target will be located in the Eigen3 namespace. Other CMake +# targets can refer to it using Eigen3::Eigen. +export (TARGETS eigen NAMESPACE Eigen3:: FILE Eigen3Targets.cmake) +# Export Eigen3 package to CMake registry such that it can be easily found by +# CMake even if it has not been installed to a standard directory. +export (PACKAGE Eigen3) + +install (EXPORT Eigen3Targets NAMESPACE Eigen3:: DESTINATION ${CMAKEPACKAGE_INSTALL_DIR}) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake @@ -594,3 +647,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake # Add uninstall target add_custom_target ( uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) + +if (EIGEN_SPLIT_TESTSUITE) + ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}") +endif() diff --git a/externals/eigen/COPYING.APACHE b/externals/eigen/COPYING.APACHE new file mode 100644 index 00000000..61e948d2 --- /dev/null +++ b/externals/eigen/COPYING.APACHE @@ -0,0 +1,203 @@ +/* + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ \ No newline at end of file diff --git a/externals/eigen/COPYING.BSD b/externals/eigen/COPYING.BSD new file mode 100644 index 00000000..8964ddfd --- /dev/null +++ b/externals/eigen/COPYING.BSD @@ -0,0 +1,26 @@ +/* + Copyright (c) 2011, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ diff --git a/externals/eigen/COPYING.GPL b/externals/eigen/COPYING.GPL new file mode 100644 index 00000000..94a9ed02 --- /dev/null +++ b/externals/eigen/COPYING.GPL @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. 
You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. 
Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. 
+ + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/externals/eigen/COPYING.LGPL b/externals/eigen/COPYING.LGPL new file mode 100644 index 00000000..4362b491 --- /dev/null +++ b/externals/eigen/COPYING.LGPL @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. 
These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+ + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it!
diff --git a/externals/eigen/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt b/externals/eigen/COPYING.MINPACK similarity index 99% rename from externals/eigen/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt rename to externals/eigen/COPYING.MINPACK index ae7984da..132cc3f3 100644 --- a/externals/eigen/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt +++ b/externals/eigen/COPYING.MINPACK @@ -49,4 +49,3 @@ SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT (INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE POSSIBILITY OF SUCH LOSS OR DAMAGES. - diff --git a/externals/eigen/COPYING.MPL2 b/externals/eigen/COPYING.MPL2 new file mode 100644 index 00000000..14e2f777 --- /dev/null +++ b/externals/eigen/COPYING.MPL2 @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. 
"Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. 
+ +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/externals/eigen/COPYING.README b/externals/eigen/COPYING.README new file mode 100644 index 00000000..de5b6321 --- /dev/null +++ b/externals/eigen/COPYING.README @@ -0,0 +1,18 @@ +Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links: + http://www.mozilla.org/MPL/2.0/ + http://www.mozilla.org/MPL/2.0/FAQ.html + +Some files contain third-party code under BSD or LGPL licenses, whence the other +COPYING.* files here. + +All the LGPL code is either LGPL 2.1-only, or LGPL 2.1-or-later. +For this reason, the COPYING.LGPL file contains the LGPL 2.1 text. + +If you want to guarantee that the Eigen code that you are #including is licensed +under the MPL2 and possibly more permissive licenses (like BSD), #define this +preprocessor symbol: + EIGEN_MPL2_ONLY +For example, with most compilers, you could add this to your project CXXFLAGS: + -DEIGEN_MPL2_ONLY +This will cause a compilation error to be generated if you #include any code that is +LGPL licensed. 
diff --git a/externals/eigen/Eigen/CMakeLists.txt b/externals/eigen/Eigen/CMakeLists.txt deleted file mode 100644 index 9eb502b7..00000000 --- a/externals/eigen/Eigen/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -include(RegexUtils) -test_escape_string_as_regex() - -file(GLOB Eigen_directory_files "*") - -escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") - -foreach(f ${Eigen_directory_files}) - if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src") - list(APPEND Eigen_directory_files_to_install ${f}) - endif() -endforeach(f ${Eigen_directory_files}) - -install(FILES - ${Eigen_directory_files_to_install} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/externals/eigen/Eigen/Cholesky b/externals/eigen/Eigen/Cholesky index 369d1f5e..a318ceb7 100644 --- a/externals/eigen/Eigen/Cholesky +++ b/externals/eigen/Eigen/Cholesky @@ -9,6 +9,7 @@ #define EIGEN_CHOLESKY_MODULE_H #include "Core" +#include "Jacobi" #include "src/Core/util/DisableStupidWarnings.h" @@ -31,11 +32,14 @@ #include "src/Cholesky/LLT.h" #include "src/Cholesky/LDLT.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Cholesky/LLT_LAPACKE.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_CHOLESKY_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/Core b/externals/eigen/Eigen/Core index 0f7fa630..5921e15f 100644 --- a/externals/eigen/Eigen/Core +++ b/externals/eigen/Eigen/Core @@ -11,232 +11,55 @@ #ifndef EIGEN_CORE_H #define EIGEN_CORE_H -// first thing Eigen does: stop the compiler from committing suicide +// first thing Eigen does: stop the compiler from reporting useless warnings. 
#include "src/Core/util/DisableStupidWarnings.h" -// Handle NVCC/CUDA/SYCL -#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! - #ifndef EIGEN_NO_DEBUG - #define EIGEN_NO_DEBUG - #endif - - #ifdef EIGEN_INTERNAL_DEBUGGING - #undef EIGEN_INTERNAL_DEBUGGING - #endif - - #ifdef EIGEN_EXCEPTIONS - #undef EIGEN_EXCEPTIONS - #endif - - // All functions callable from CUDA code must be qualified with __device__ - #ifdef __CUDACC__ - // Do not try to vectorize on CUDA and SYCL! - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif - - #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro - // works properly on the device side - #include - #else - #define EIGEN_DEVICE_FUNC - #endif - -#else - #define EIGEN_DEVICE_FUNC +// then include this file where all our macros are defined. It's really important to do it first because +// it's where we do all the compiler/OS/arch detections and define most defaults. +#include "src/Core/util/Macros.h" -#endif +// This detects SSE/AVX/NEON/etc. and configure alignment settings +#include "src/Core/util/ConfigureVectorization.h" -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. 
-#if defined(__CUDA_ARCH__) && defined(__NVCC__) - #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; -#else - #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; +// We need cuda_runtime.h/hip_runtime.h to ensure that +// the EIGEN_USING_STD macro works properly on the device side +#if defined(EIGEN_CUDACC) + #include +#elif defined(EIGEN_HIPCC) + #include #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) - #define EIGEN_EXCEPTIONS -#endif #ifdef EIGEN_EXCEPTIONS #include #endif -// then include this file where all our macros are defined. It's really important to do it first because -// it's where we do all the alignment settings (platform detection and honoring the user's will if he -// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization. -#include "src/Core/util/Macros.h" - // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. -#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) +#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) && EIGEN_GNUC_AT_MOST(5,5) #pragma GCC optimize ("-fno-ipa-cp-clone") #endif +// Prevent ICC from specializing std::complex operators that silently fail +// on device. This allows us to use our own device-compatible specializations +// instead. +#if defined(EIGEN_COMP_ICC) && defined(EIGEN_GPU_COMPILE_PHASE) \ + && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_) +#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1 +#endif #include // this include file manages BLAS and MKL related macros // and inclusion of their respective header files #include "src/Core/util/MKL_support.h" -// if alignment is disabled, then disable vectorization. 
Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into -// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks -#if EIGEN_MAX_ALIGN_BYTES==0 - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif -#endif - -#if EIGEN_COMP_MSVC - #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled - #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later - // Remember that usage of defined() in a #define is undefined by the standard. - // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. - #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 - #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER - #endif - #endif -#else - // Remember that usage of defined() in a #define is undefined by the standard - #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) - #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC - #endif -#endif - -#ifndef EIGEN_DONT_VECTORIZE - - #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) - - // Defines symbols for compile-time detection of which instructions are - // used. - // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_SSE - #define EIGEN_VECTORIZE_SSE2 - - // Detect sse3/ssse3/sse4: - // gcc and icc defines __SSE3__, ... - // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you - // want to force the use of those instructions with msvc. 
- #ifdef __SSE3__ - #define EIGEN_VECTORIZE_SSE3 - #endif - #ifdef __SSSE3__ - #define EIGEN_VECTORIZE_SSSE3 - #endif - #ifdef __SSE4_1__ - #define EIGEN_VECTORIZE_SSE4_1 - #endif - #ifdef __SSE4_2__ - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif - #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512) - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_FMA - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #endif - - // include files - - // This extern "C" works around a MINGW-w64 compilation issue - // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 - // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). - // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations - // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; - // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. - // notice that since these are C headers, the extern "C" is theoretically needed anyways. - extern "C" { - // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. - // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 - #include - #else - #include - #include - #include - #ifdef EIGEN_VECTORIZE_SSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_1 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_2 - #include - #endif - #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) - #include - #endif - #endif - } // end extern "C" - #elif defined __VSX__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif defined __ALTIVEC__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ALTIVEC - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif (defined __ARM_NEON) || (defined __ARM_NEON__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_NEON - #include - #elif (defined __s390x__ && defined __VEC__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ZVECTOR - #include - #endif -#endif -#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) - // We can use the optimized fp16 to float and float to fp16 conversion routines - #define EIGEN_HAS_FP16_C +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + #define EIGEN_HAS_GPU_FP16 #endif -#if defined __CUDACC__ - #define EIGEN_VECTORIZE_CUDA - #include - #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 - #define EIGEN_HAS_CUDA_FP16 - #endif -#endif - -#if defined EIGEN_HAS_CUDA_FP16 - #include - #include +#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16) + #define EIGEN_HAS_GPU_BF16 #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -260,7 +83,10 @@ #include #include #include -#include +#include +#ifndef EIGEN_NO_IO + #include +#endif #include #include #include @@ -268,6 +94,10 @@ 
// for min/max: #include +#if EIGEN_HAS_CXX11 +#include +#endif + // for std::is_nothrow_move_assignable #ifdef EIGEN_INCLUDE_TYPE_TRAITS #include @@ -283,38 +113,25 @@ #include #endif -/** \brief Namespace containing all symbols from the %Eigen library. */ -namespace Eigen { - -inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_AVX512) - return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_AVX) - return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_2) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_1) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; -#elif defined(EIGEN_VECTORIZE_SSSE3) - return "SSE, SSE2, SSE3, SSSE3"; -#elif defined(EIGEN_VECTORIZE_SSE3) - return "SSE, SSE2, SSE3"; -#elif defined(EIGEN_VECTORIZE_SSE2) - return "SSE, SSE2"; -#elif defined(EIGEN_VECTORIZE_ALTIVEC) - return "AltiVec"; -#elif defined(EIGEN_VECTORIZE_VSX) - return "VSX"; -#elif defined(EIGEN_VECTORIZE_NEON) - return "ARM NEON"; -#elif defined(EIGEN_VECTORIZE_ZVECTOR) - return "S390X ZVECTOR"; -#else - return "None"; +#if defined(EIGEN_USE_SYCL) + #undef min + #undef max + #undef isnan + #undef isinf + #undef isfinite + #include + #include + #include + #include + #include + #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0 + #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16 + #endif + #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1 + #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16 + #endif #endif -} -} // end namespace Eigen #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT // This will generate an error message: @@ -323,7 +140,7 @@ inline static const char *SimdInstructionSetsInUse(void) { namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with 
std:: everytime just to +// we use size_t frequently and we'll never remember to prepend it with std:: every time just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t @@ -347,56 +164,90 @@ using std::ptrdiff_t; #include "src/Core/util/StaticAssert.h" #include "src/Core/util/XprHelper.h" #include "src/Core/util/Memory.h" +#include "src/Core/util/IntegralConstant.h" +#include "src/Core/util/SymbolicIndex.h" #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" +#include "src/Core/arch/Default/ConjHelper.h" +// Generic half float support +#include "src/Core/arch/Default/Half.h" +#include "src/Core/arch/Default/BFloat16.h" +#include "src/Core/arch/Default/TypeCasting.h" +#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" + #include "src/Core/arch/SSE/Complex.h" #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX512/PacketMath.h" + #include "src/Core/arch/AVX512/TypeCasting.h" + #include "src/Core/arch/AVX512/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/MathFunctions.h" #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/MathFunctions.h" - #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/SSE/MathFunctions.h" + #include 
"src/Core/arch/AVX/MathFunctions.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/SSE/TypeCasting.h" #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" #include "src/Core/arch/AltiVec/MathFunctions.h" #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" + #include "src/Core/arch/NEON/TypeCasting.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" +#elif defined EIGEN_VECTORIZE_SVE + #include "src/Core/arch/SVE/PacketMath.h" + #include "src/Core/arch/SVE/TypeCasting.h" + #include "src/Core/arch/SVE/MathFunctions.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" #include "src/Core/arch/ZVector/Complex.h" +#elif defined EIGEN_VECTORIZE_MSA + #include "src/Core/arch/MSA/PacketMath.h" + #include "src/Core/arch/MSA/MathFunctions.h" + #include "src/Core/arch/MSA/Complex.h" #endif -// Half float support -#include "src/Core/arch/CUDA/Half.h" -#include "src/Core/arch/CUDA/PacketMathHalf.h" -#include "src/Core/arch/CUDA/TypeCasting.h" +#if defined EIGEN_VECTORIZE_GPU + #include "src/Core/arch/GPU/PacketMath.h" + #include "src/Core/arch/GPU/MathFunctions.h" + #include "src/Core/arch/GPU/TypeCasting.h" +#endif -#if defined EIGEN_VECTORIZE_CUDA - #include "src/Core/arch/CUDA/PacketMath.h" - #include "src/Core/arch/CUDA/MathFunctions.h" +#if defined(EIGEN_USE_SYCL) + #include "src/Core/arch/SYCL/SyclMemoryModel.h" + #include "src/Core/arch/SYCL/InteropHeaders.h" +#if !defined(EIGEN_DONT_VECTORIZE_SYCL) + #include "src/Core/arch/SYCL/PacketMath.h" + #include "src/Core/arch/SYCL/MathFunctions.h" + #include "src/Core/arch/SYCL/TypeCasting.h" +#endif 
#endif #include "src/Core/arch/Default/Settings.h" +// This file provides generic implementations valid for scalar as well +#include "src/Core/arch/Default/GenericPacketMathFunctions.h" #include "src/Core/functors/TernaryFunctors.h" #include "src/Core/functors/BinaryFunctors.h" @@ -407,9 +258,16 @@ using std::ptrdiff_t; // Specialized functors to enable the processing of complex numbers // on CUDA devices +#ifdef EIGEN_CUDACC #include "src/Core/arch/CUDA/Complex.h" +#endif -#include "src/Core/IO.h" +#include "src/Core/util/IndexedViewHelper.h" +#include "src/Core/util/ReshapedHelper.h" +#include "src/Core/ArithmeticSequence.h" +#ifndef EIGEN_NO_IO + #include "src/Core/IO.h" +#endif #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -450,6 +308,8 @@ using std::ptrdiff_t; #include "src/Core/Ref.h" #include "src/Core/Block.h" #include "src/Core/VectorBlock.h" +#include "src/Core/IndexedView.h" +#include "src/Core/Reshaped.h" #include "src/Core/Transpose.h" #include "src/Core/DiagonalMatrix.h" #include "src/Core/Diagonal.h" @@ -486,13 +346,21 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include "src/Core/ConditionEstimator.h" +#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) + #include "src/Core/arch/AltiVec/MatrixProduct.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" +#endif + #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" +#include "src/Core/PartialReduxEvaluator.h" #include "src/Core/Random.h" #include "src/Core/Replicate.h" #include "src/Core/Reverse.h" #include "src/Core/ArrayWrapper.h" +#include "src/Core/StlIterators.h" #ifdef EIGEN_USE_BLAS #include "src/Core/products/GeneralMatrixMatrix_BLAS.h" diff --git a/externals/eigen/Eigen/Eigenvalues b/externals/eigen/Eigen/Eigenvalues index 009e529e..5467a2e7 100644 --- a/externals/eigen/Eigen/Eigenvalues +++ 
b/externals/eigen/Eigen/Eigenvalues @@ -10,14 +10,14 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" #include "LU" #include "Geometry" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Eigenvalues_Module Eigenvalues module * * @@ -45,7 +45,11 @@ #include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" @@ -54,4 +58,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_EIGENVALUES_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/Geometry b/externals/eigen/Eigen/Geometry index 716d5295..bc78110a 100644 --- a/externals/eigen/Eigen/Geometry +++ b/externals/eigen/Eigen/Geometry @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "SVD" #include "LU" #include +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Geometry_Module Geometry module * * This module provides support for: @@ -49,14 +49,11 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX -#include "src/Geometry/arch/Geometry_SSE.h" +// Use the SSE optimized version whenever possible. 
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON) +#include "src/Geometry/arch/Geometry_SIMD.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_GEOMETRY_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ - diff --git a/externals/eigen/Eigen/Householder b/externals/eigen/Eigen/Householder index 89cd81b1..f2fa7996 100644 --- a/externals/eigen/Eigen/Householder +++ b/externals/eigen/Eigen/Householder @@ -27,4 +27,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_HOUSEHOLDER_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/Jacobi b/externals/eigen/Eigen/Jacobi index 17c1d785..43edc7a1 100644 --- a/externals/eigen/Eigen/Jacobi +++ b/externals/eigen/Eigen/Jacobi @@ -29,5 +29,4 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_JACOBI_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/KLUSupport b/externals/eigen/Eigen/KLUSupport new file mode 100644 index 00000000..b23d9053 --- /dev/null +++ b/externals/eigen/Eigen/KLUSupport @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_MODULE_H +#define EIGEN_KLUSUPPORT_MODULE_H + +#include + +#include + +extern "C" { +#include +#include + } + +/** \ingroup Support_modules + * \defgroup KLUSupport_Module KLUSupport module + * + * This module provides an interface to the KLU library which is part of the suitesparse package. + * It provides the following factorization class: + * - class KLU: a sparse LU factorization, well-suited for circuit simulation. 
+ * + * \code + * #include <Eigen/KLUSupport> + * \endcode + * + * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies. + * The dependencies depend on how KLU has been compiled. + * For a cmake based project, you can use our FindKLU.cmake module to help you in this task. + * + */ + +#include "src/KLUSupport/KLUSupport.h" + +#include + +#endif // EIGEN_KLUSUPPORT_MODULE_H diff --git a/externals/eigen/Eigen/LU b/externals/eigen/Eigen/LU index 6f6c5562..1236ceb0 100644 --- a/externals/eigen/Eigen/LU +++ b/externals/eigen/Eigen/LU @@ -28,19 +28,20 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX - #include "src/LU/arch/Inverse_SSE.h" +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON + #include "src/LU/arch/InverseSize4.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_LU_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/OrderingMethods b/externals/eigen/Eigen/OrderingMethods index d8ea3619..29691a62 100644 --- a/externals/eigen/Eigen/OrderingMethods +++ b/externals/eigen/Eigen/OrderingMethods @@ -63,10 +63,7 @@ * \endcode */ -#ifndef EIGEN_MPL2_ONLY #include "src/OrderingMethods/Amd.h" -#endif - #include "src/OrderingMethods/Ordering.h" #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/externals/eigen/Eigen/PaStiXSupport b/externals/eigen/Eigen/PaStiXSupport index de3a63b4..234619ac 100644 --- a/externals/eigen/Eigen/PaStiXSupport +++
b/externals/eigen/Eigen/PaStiXSupport @@ -36,6 +36,7 @@ extern "C" { * \endcode * * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies. + * This wrapper requires PaStiX version 5.x compiled without MPI support. * The dependencies depend on how PaSTiX has been compiled. * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task. * diff --git a/externals/eigen/Eigen/QR b/externals/eigen/Eigen/QR index 80838e3b..8465b62c 100644 --- a/externals/eigen/Eigen/QR +++ b/externals/eigen/Eigen/QR @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup QR_Module QR module * * @@ -36,7 +36,11 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif @@ -44,4 +48,3 @@ #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_QR_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/QtAlignedMalloc b/externals/eigen/Eigen/QtAlignedMalloc index c6571f12..6fe82374 100644 --- a/externals/eigen/Eigen/QtAlignedMalloc +++ b/externals/eigen/Eigen/QtAlignedMalloc @@ -27,7 +27,7 @@ void qFree(void *ptr) void *qRealloc(void *ptr, std::size_t size) { void* newPtr = Eigen::internal::aligned_malloc(size); - memcpy(newPtr, ptr, size); + std::memcpy(newPtr, ptr, size); Eigen::internal::aligned_free(ptr); return newPtr; } @@ -37,4 +37,3 @@ void *qRealloc(void *ptr, std::size_t size) #endif #endif // EIGEN_QTMALLOC_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git
a/externals/eigen/Eigen/SVD b/externals/eigen/Eigen/SVD index 86143c23..34517949 100644 --- a/externals/eigen/Eigen/SVD +++ b/externals/eigen/Eigen/SVD @@ -37,11 +37,14 @@ #include "src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/SVD/JacobiSVD_LAPACKE.h" #endif #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_SVD_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/Eigen/Sparse b/externals/eigen/Eigen/Sparse index 136e681a..a2ef7a66 100644 --- a/externals/eigen/Eigen/Sparse +++ b/externals/eigen/Eigen/Sparse @@ -25,9 +25,7 @@ #include "SparseCore" #include "OrderingMethods" -#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" -#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/externals/eigen/Eigen/SparseCholesky b/externals/eigen/Eigen/SparseCholesky index b6a320c4..d2b1f127 100644 --- a/externals/eigen/Eigen/SparseCholesky +++ b/externals/eigen/Eigen/SparseCholesky @@ -30,16 +30,8 @@ * \endcode */ -#ifdef EIGEN_MPL2_ONLY -#error The SparseCholesky module has nothing to offer in MPL2 only mode -#endif - #include "src/SparseCholesky/SimplicialCholesky.h" - -#ifndef EIGEN_MPL2_ONLY #include "src/SparseCholesky/SimplicialCholesky_impl.h" -#endif - #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_SPARSECHOLESKY_MODULE_H diff --git a/externals/eigen/Eigen/SparseLU b/externals/eigen/Eigen/SparseLU index 38b38b53..37c4a5c5 100644 --- a/externals/eigen/Eigen/SparseLU +++ b/externals/eigen/Eigen/SparseLU @@ -23,6 +23,8 @@ // Ordering interface #include "OrderingMethods" +#include "src/Core/util/DisableStupidWarnings.h" + #include "src/SparseLU/SparseLU_gemm_kernel.h" #include "src/SparseLU/SparseLU_Structs.h" @@ -43,4 +45,6 @@ #include "src/SparseLU/SparseLU_Utils.h" #include 
"src/SparseLU/SparseLU.h" +#include "src/Core/util/ReenableStupidWarnings.h" + #endif // EIGEN_SPARSELU_MODULE_H diff --git a/externals/eigen/Eigen/SparseQR b/externals/eigen/Eigen/SparseQR index a6f3b7f7..f5fc5fa7 100644 --- a/externals/eigen/Eigen/SparseQR +++ b/externals/eigen/Eigen/SparseQR @@ -28,7 +28,6 @@ * */ -#include "OrderingMethods" #include "src/SparseCore/SparseColEtree.h" #include "src/SparseQR/SparseQR.h" diff --git a/externals/eigen/Eigen/src/Cholesky/LDLT.h b/externals/eigen/Eigen/src/Cholesky/LDLT.h index fcee7b2e..1013ca04 100644 --- a/externals/eigen/Eigen/src/Cholesky/LDLT.h +++ b/externals/eigen/Eigen/src/Cholesky/LDLT.h @@ -16,6 +16,15 @@ namespace Eigen { namespace internal { + template struct traits > + : traits<_MatrixType> + { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; + }; + template struct LDLT_Traits; // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef @@ -36,7 +45,7 @@ namespace internal { * matrix \f$ A \f$ such that \f$ A = P^TLDL^*P \f$, where P is a permutation matrix, L * is lower triangular with a unit diagonal and D is a diagonal matrix. * - * The decomposition uses pivoting to ensure stability, so that L will have + * The decomposition uses pivoting to ensure stability, so that D will have * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root * on D also stabilizes the computation. * @@ -44,24 +53,23 @@ namespace internal { * decomposition to determine whether a system of equations has a solution. * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. 
- * + * * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, UpLo = _UpLo }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix TmpMatrixType; typedef Transpositions TranspositionType; @@ -180,6 +188,7 @@ template class LDLT return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A. * * This function also supports in-place solves using the syntax x = decompositionObject.solve(x) . @@ -191,19 +200,14 @@ template class LDLT * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function - * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular. + * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular. 
* * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt() */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LDLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LDLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template bool solveInPlace(MatrixBase &bAndX) const; @@ -242,13 +246,13 @@ template class LDLT */ const LDLT& adjoint() const { return *this; }; - inline Index rows() const { return m_matrix.rows(); } - inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \returns \c Success if computation was successful, + * \c NumericalIssue if the factorization failed because of a zero pivot. 
*/ ComputationInfo info() const { @@ -258,8 +262,10 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -305,7 +311,8 @@ template<> struct ldlt_inplace if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; + if(size==0) sign = ZeroSign; + else if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; @@ -376,6 +383,8 @@ template<> struct ldlt_inplace if((rs>0) && pivot_is_valid) A21 /= realAkk; + else if(rs>0) + ret = ret && (A21.array()==Scalar(0)).all(); if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed else if(!pivot_is_valid) found_zero_pivot = true; @@ -557,25 +566,33 @@ template template void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ // dst = P b dst = m_transpositions * rhs; // dst = L^-1 (P b) - matrixL().solveInPlace(dst); + // dst = L^-*T (P b) + matrixL().template conjugateIf().solveInPlace(dst); - // dst = D^-1 (L^-1 P b) + // dst = D^-* (L^-1 P b) + // dst = D^-1 (L^-*T P b) // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); - // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon - // as motivated by LAPACK's xGELSS: + // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min()) + // and the maximal diagonal entry * epsilon as 
motivated by LAPACK's xGELSS: // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits::epsilon(),RealScalar(1) / NumTraits::highest()); // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. - RealScalar tolerance = RealScalar(1) / NumTraits::highest(); - + // Using numeric_limits::min() gives us more robustness to denormals. + RealScalar tolerance = (std::numeric_limits::min)(); for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) @@ -584,10 +601,12 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons dst.row(i).setZero(); } - // dst = L^-T (D^-1 L^-1 P b) - matrixU().solveInPlace(dst); + // dst = L^-* (D^-* L^-1 P b) + // dst = L^-T (D^-1 L^-*T P b) + matrixL().transpose().template conjugateIf().solveInPlace(dst); - // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b + // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b + // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b dst = m_transpositions.transpose() * dst; } #endif diff --git a/externals/eigen/Eigen/src/Cholesky/LLT.h b/externals/eigen/Eigen/src/Cholesky/LLT.h index 87ca8d42..8c9b2b39 100644 --- a/externals/eigen/Eigen/src/Cholesky/LLT.h +++ b/externals/eigen/Eigen/src/Cholesky/LLT.h @@ -13,6 +13,16 @@ namespace Eigen { namespace internal{ + +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; + template struct LLT_Traits; } @@ -24,7 +34,7 @@ template struct LLT_Traits; * * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. - * The other triangular part won't be read. 
+ * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * matrix A such that A = LL^* = U^*U, where L is lower triangular. @@ -41,27 +51,30 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * + * \b Performance: for best performance, it is recommended to use a column-major storage format + * with the Lower triangular part (the default), or, equivalently, a row-major storage format + * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization + * step, and rank-updates can be up to 3 times slower. + * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * + * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. + * Therefore, the strict lower part does not have to store correct values. + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ - /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) - * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, - * the strict lower part does not have to store correct values. 
- */ template class LLT + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(LLT) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 - typedef typename MatrixType::StorageIndex StorageIndex; enum { PacketSize = internal::packet_traits::size, @@ -96,7 +109,7 @@ template class LLT compute(matrix.derived()); } - /** \brief Constructs a LDLT factorization from a given matrix + /** \brief Constructs a LLT factorization from a given matrix * * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when * \c MatrixType is a Eigen::Ref. @@ -125,6 +138,7 @@ template class LLT return Traits::getL(m_matrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A. * * Since this LLT class assumes anyway that the matrix A is invertible, the solution @@ -137,16 +151,11 @@ template class LLT */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LLT is not initialized."); - eigen_assert(m_matrix.rows()==b.rows() - && "LLT::solve(): invalid number of rows of the right hand side matrix b"); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif template - void solveInPlace(MatrixBase &bAndX) const; + void solveInPlace(const MatrixBase &bAndX) const; template LLT& compute(const EigenBase& matrix); @@ -176,8 +185,8 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. 
+ * \returns \c Success if computation was successful, + * \c NumericalIssue if the matrix appears not to be positive definite. */ ComputationInfo info() const { @@ -190,18 +199,20 @@ template class LLT * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as: * \code x = decomposition.adjoint().solve(b) \endcode */ - const LLT& adjoint() const { return *this; }; + const LLT& adjoint() const EIGEN_NOEXCEPT { return *this; }; - inline Index rows() const { return m_matrix.rows(); } - inline Index cols() const { return m_matrix.cols(); } + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } template - LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); + LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -425,7 +436,8 @@ LLT& LLT::compute(const EigenBase eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); - m_matrix = a.derived(); + if (!internal::is_same_dense(m_matrix, a.derived())) + m_matrix = a.derived(); // Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0); @@ -454,7 +466,7 @@ LLT& LLT::compute(const EigenBase */ template template -LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) +LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); eigen_assert(v.size()==m_matrix.cols()); @@ -472,8 +484,17 @@ template template void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - dst = rhs; - solveInPlace(dst); + _solve_impl_transposed(rhs, dst); +} + +template +template +void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + dst = rhs; + + matrixL().template conjugateIf().solveInPlace(dst); + matrixU().template conjugateIf().solveInPlace(dst); } #endif @@ -485,11 +506,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * This version avoids a copy when the right hand side matrix b is not needed anymore. * + * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. + * This function will const_cast it, so constness isn't honored here. 
+ * * \sa LLT::solve(), MatrixBase::llt() */ template template -void LLT::solveInPlace(MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); diff --git a/externals/eigen/Eigen/src/CholmodSupport/CholmodSupport.h b/externals/eigen/Eigen/src/CholmodSupport/CholmodSupport.h index 57197202..adaf5285 100644 --- a/externals/eigen/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/externals/eigen/Eigen/src/CholmodSupport/CholmodSupport.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CHOLMODSUPPORT_H #define EIGEN_CHOLMODSUPPORT_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -32,7 +32,7 @@ template<> struct cholmod_configure_matrix > { } }; -// Other scalar types are not yet suppotred by Cholmod +// Other scalar types are not yet supported by Cholmod // template<> struct cholmod_configure_matrix { // template // static void run(CholmodType& mat) { @@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref > res.dtype = 0; res.stype = -1; - + if (internal::is_same<_StorageIndex,int>::value) { res.itype = CHOLMOD_INT; } - else if (internal::is_same<_StorageIndex,long>::value) + else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value) { res.itype = CHOLMOD_LONG; } @@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref > // setup res.xtype internal::cholmod_configure_matrix<_Scalar>::run(res); - + res.stype = 0; - + return res; } @@ -121,9 +121,12 @@ template cholmod_sparse viewAsCholmod(const SparseSelfAdjointView, UpLo>& mat) { cholmod_sparse res = viewAsCholmod(Ref >(mat.matrix().const_cast_derived())); - + if(UpLo==Upper) res.stype = 1; if(UpLo==Lower) res.stype = -1; + // swap stype for rowmajor matrices (only works for real matrices) + EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + if(_Options & RowMajorBit) res.stype *=-1; return res; } @@ -159,6 
+162,44 @@ MappedSparseMatrix viewAsEigen(cholmod_sparse& cm) static_cast(cm.p), static_cast(cm.i),static_cast(cm.x) ); } +namespace internal { + +// template specializations for int and long that call the correct cholmod method + +#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ + template inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ + template<> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } + +#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ + template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ + template<> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } + +EIGEN_CHOLMOD_SPECIALIZE0(int, start) +EIGEN_CHOLMOD_SPECIALIZE0(int, finish) + +EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L) +EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense, cholmod_dense*, X) +EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) + +EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) + +template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } +template<> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } + +template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } +template<> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } + +template +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, 
&Common); } +template<> +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } + +#undef EIGEN_CHOLMOD_SPECIALIZE0 +#undef EIGEN_CHOLMOD_SPECIALIZE1 + +} // namespace internal + + enum CholmodMode { CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt }; @@ -195,7 +236,7 @@ class CholmodBase : public SparseSolverBase { EIGEN_STATIC_ASSERT((internal::is_same::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY); m_shiftOffset[0] = m_shiftOffset[1] = 0.0; - cholmod_start(&m_cholmod); + internal::cm_start(m_cholmod); } explicit CholmodBase(const MatrixType& matrix) @@ -203,23 +244,23 @@ class CholmodBase : public SparseSolverBase { EIGEN_STATIC_ASSERT((internal::is_same::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY); m_shiftOffset[0] = m_shiftOffset[1] = 0.0; - cholmod_start(&m_cholmod); + internal::cm_start(m_cholmod); compute(matrix); } ~CholmodBase() { if(m_cholmodFactor) - cholmod_free_factor(&m_cholmodFactor, &m_cholmod); - cholmod_finish(&m_cholmod); + internal::cm_free_factor(m_cholmodFactor, m_cholmod); + internal::cm_finish(m_cholmod); } - + inline StorageIndex cols() const { return internal::convert_index(m_cholmodFactor->n); } inline StorageIndex rows() const { return internal::convert_index(m_cholmodFactor->n); } - + /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -235,29 +276,29 @@ class CholmodBase : public SparseSolverBase factorize(matrix); return derived(); } - + /** Performs a symbolic decomposition on the sparsity pattern of \a matrix. * * This function is particularly useful when solving for several problems having the same structure. 
- * + * * \sa factorize() */ void analyzePattern(const MatrixType& matrix) { if(m_cholmodFactor) { - cholmod_free_factor(&m_cholmodFactor, &m_cholmod); + internal::cm_free_factor(m_cholmodFactor, m_cholmod); m_cholmodFactor = 0; } cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); - m_cholmodFactor = cholmod_analyze(&A, &m_cholmod); - + m_cholmodFactor = internal::cm_analyze(A, m_cholmod); + this->m_isInitialized = true; this->m_info = Success; m_analysisIsOk = true; m_factorizationIsOk = false; } - + /** Performs a numeric decomposition of \a matrix * * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed. @@ -268,17 +309,17 @@ class CholmodBase : public SparseSolverBase { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); - cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod); + internal::cm_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod); // If the factorization failed, minor is the column at which it did. On success minor == n. this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue); m_factorizationIsOk = true; } - + /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations. * See the Cholmod user guide for details. */ cholmod_common& cholmod() { return m_cholmod; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal */ template @@ -288,22 +329,23 @@ class CholmodBase : public SparseSolverBase const Index size = m_cholmodFactor->n; EIGEN_UNUSED_VARIABLE(size); eigen_assert(size==b.rows()); - - // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref. + + // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref. 
Ref > b_ref(b.derived()); cholmod_dense b_cd = viewAsCholmod(b_ref); - cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod); + cholmod_dense* x_cd = internal::cm_solve(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod); if(!x_cd) { this->m_info = NumericalIssue; return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) + // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve dest = Matrix::Map(reinterpret_cast(x_cd->x),b.rows(),b.cols()); - cholmod_free_dense(&x_cd, &m_cholmod); + internal::cm_free_dense(x_cd, m_cholmod); } - + /** \internal */ template void _solve_impl(const SparseMatrixBase &b, SparseMatrixBase &dest) const @@ -316,19 +358,20 @@ class CholmodBase : public SparseSolverBase // note: cs stands for Cholmod Sparse Ref > b_ref(b.const_cast_derived()); cholmod_sparse b_cs = viewAsCholmod(b_ref); - cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod); + cholmod_sparse* x_cs = internal::cm_spsolve(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod); if(!x_cs) { this->m_info = NumericalIssue; return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) + // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver) dest.derived() = viewAsEigen(*x_cs); - cholmod_free_sparse(&x_cs, &m_cholmod); + internal::cm_free_sparse(x_cs, m_cholmod); } #endif // EIGEN_PARSED_BY_DOXYGEN - - + + /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization. 
* * During the numerical factorization, an offset term is added to the diagonal coefficients:\n @@ -343,7 +386,7 @@ class CholmodBase : public SparseSolverBase m_shiftOffset[0] = double(offset); return derived(); } - + /** \returns the determinant of the underlying matrix from the current factorization */ Scalar determinant() const { @@ -398,7 +441,7 @@ class CholmodBase : public SparseSolverBase template void dumpMemory(Stream& /*s*/) {} - + protected: mutable cholmod_common m_cholmod; cholmod_factor* m_cholmodFactor; @@ -435,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLLT() : Base() { init(); } CholmodSimplicialLLT(const MatrixType& matrix) : Base() @@ -486,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLDLT() : Base() { init(); } CholmodSimplicialLDLT(const MatrixType& matrix) : Base() @@ -535,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper { typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSupernodalLLT() : Base() { init(); } CholmodSupernodalLLT(const MatrixType& matrix) : Base() @@ -586,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom { typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodDecomposition() : Base() { init(); } CholmodDecomposition(const MatrixType& matrix) : Base() @@ -600,7 +643,7 @@ class CholmodDecomposition : public 
CholmodBase<_MatrixType, _UpLo, CholmodDecom } ~CholmodDecomposition() {} - + void setMode(CholmodMode mode) { switch(mode) diff --git a/externals/eigen/Eigen/src/Core/ArithmeticSequence.h b/externals/eigen/Eigen/src/Core/ArithmeticSequence.h new file mode 100644 index 00000000..b6200fac --- /dev/null +++ b/externals/eigen/Eigen/src/Core/ArithmeticSequence.h @@ -0,0 +1,413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARITHMETIC_SEQUENCE_H +#define EIGEN_ARITHMETIC_SEQUENCE_H + +namespace Eigen { + +namespace internal { + +#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) +template struct aseq_negate {}; + +template<> struct aseq_negate { + typedef Index type; +}; + +template struct aseq_negate > { + typedef FixedInt<-N> type; +}; + +// Compilation error in the following case: +template<> struct aseq_negate > {}; + +template::value, + bool SizeIsSymbolic =symbolic::is_symbolic::value> +struct aseq_reverse_first_type { + typedef Index type; +}; + +template +struct aseq_reverse_first_type { + typedef symbolic::AddExpr > >, + symbolic::ValueExpr > + > type; +}; + +template +struct aseq_reverse_first_type_aux { + typedef Index type; +}; + +template +struct aseq_reverse_first_type_aux::type> { + typedef FixedInt<(SizeType::value-1)*IncrType::value> type; +}; + +template +struct aseq_reverse_first_type { + typedef typename aseq_reverse_first_type_aux::type Aux; + typedef symbolic::AddExpr > type; +}; + +template +struct aseq_reverse_first_type { + typedef symbolic::AddExpr > >, + symbolic::ValueExpr >, + symbolic::ValueExpr<> > type; +}; +#endif + +// Helper to cleanup the type of the increment: +template struct cleanup_seq_incr { + typedef 
typename cleanup_index_type::type type; +}; + +} + +//-------------------------------------------------------------------------------- +// seq(first,last,incr) and seqN(first,size,incr) +//-------------------------------------------------------------------------------- + +template > +class ArithmeticSequence; + +template +ArithmeticSequence::type, + typename internal::cleanup_index_type::type, + typename internal::cleanup_seq_incr::type > +seqN(FirstType first, SizeType size, IncrType incr); + +/** \class ArithmeticSequence + * \ingroup Core_Module + * + * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by + * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride) + * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i. + * + * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments + * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the + * only way it is used. + * + * \tparam FirstType type of the first element, usually an Index, + * but internally it can be a symbolic expression + * \tparam SizeType type representing the size of the sequence, usually an Index + * or a compile time integral constant. 
Internally, it can also be a symbolic expression + * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1) + * + * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView + */ +template +class ArithmeticSequence +{ +public: + ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {} + ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {} + + enum { + SizeAtCompileTime = internal::get_fixed_value::value, + IncrAtCompileTime = internal::get_fixed_value::value + }; + + /** \returns the size, i.e., number of elements, of the sequence */ + Index size() const { return m_size; } + + /** \returns the first element \f$ a_0 \f$ in the sequence */ + Index first() const { return m_first; } + + /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */ + Index operator[](Index i) const { return m_first + i * m_incr; } + + const FirstType& firstObject() const { return m_first; } + const SizeType& sizeObject() const { return m_size; } + const IncrType& incrObject() const { return m_incr; } + +protected: + FirstType m_first; + SizeType m_size; + IncrType m_incr; + +public: + +#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) + auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { + return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); + } +#else +protected: + typedef typename internal::aseq_negate::type ReverseIncrType; + typedef typename internal::aseq_reverse_first_type::type ReverseFirstType; +public: + ArithmeticSequence + reverse() const { + return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); + } +#endif +}; + +/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) 
*/ +template +ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type > +seqN(FirstType first, SizeType size, IncrType incr) { + return ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type>(first,size,incr); +} + +/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment + * + * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */ +template +ArithmeticSequence::type,typename internal::cleanup_index_type::type > +seqN(FirstType first, SizeType size) { + return ArithmeticSequence::type,typename internal::cleanup_index_type::type>(first,size); +} + +#ifdef EIGEN_PARSED_BY_DOXYGEN + +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr + * + * It is essentially an alias to: + * \code + * seqN(f, (l-f+incr)/incr, incr); + * \endcode + * + * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) + */ +template +auto seq(FirstType f, LastType l, IncrType incr); + +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment + * + * It is essentially an alias to: + * \code + * seqN(f,l-f+1); + * \endcode + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) + */ +template +auto seq(FirstType f, LastType l); + +#else // EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX11 +template +auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + - typename internal::cleanup_index_type::type(f)+fix<1>()))) +{ + return seqN(typename internal::cleanup_index_type::type(f), + (typename internal::cleanup_index_type::type(l) + -typename internal::cleanup_index_type::type(f)+fix<1>())); +} + +template +auto seq(FirstType f, LastType l, IncrType incr) + -> decltype(seqN(typename 
internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + - typename internal::cleanup_index_type::type(f)+typename internal::cleanup_seq_incr::type(incr) + ) / typename internal::cleanup_seq_incr::type(incr), + typename internal::cleanup_seq_incr::type(incr))) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + ( typename internal::cleanup_index_type::type(l) + -typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr), + CleanedIncrType(incr)); +} + +#else // EIGEN_HAS_CXX11 + +template +typename internal::enable_if::value || symbolic::is_symbolic::value), + ArithmeticSequence::type,Index> >::type +seq(FirstType f, LastType l) +{ + return seqN(typename internal::cleanup_index_type::type(f), + Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+fix<1>()))); +} + +template +typename internal::enable_if::value, + ArithmeticSequence,symbolic::ValueExpr<> >, + symbolic::ValueExpr > > > >::type +seq(const symbolic::BaseExpr &f, LastType l) +{ + return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+fix<1>())); +} + +template +typename internal::enable_if::value, + ArithmeticSequence::type, + symbolic::AddExpr >, + symbolic::ValueExpr > > > >::type +seq(FirstType f, const symbolic::BaseExpr &l) +{ + return seqN(typename internal::cleanup_index_type::type(f),(l.derived()-typename internal::cleanup_index_type::type(f)+fix<1>())); +} + +template +ArithmeticSequence >,symbolic::ValueExpr > > > +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l) +{ + return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); +} + + +template +typename internal::enable_if::value || symbolic::is_symbolic::value), + ArithmeticSequence::type,Index,typename internal::cleanup_seq_incr::type> >::type +seq(FirstType f, LastType l, IncrType incr) +{ + typedef 
typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr); +} + +template +typename internal::enable_if::value, + ArithmeticSequence, + symbolic::ValueExpr<> >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> >::type +seq(const symbolic::BaseExpr &f, LastType l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} + +template +typename internal::enable_if::value, + ArithmeticSequence::type, + symbolic::QuotientExpr >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> >::type +seq(FirstType f, const symbolic::BaseExpr &l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(typename internal::cleanup_index_type::type(f), + (l.derived()-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} + +template +ArithmeticSequence >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, + typename internal::cleanup_seq_incr::type> +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l, IncrType incr) +{ + typedef typename internal::cleanup_seq_incr::type CleanedIncrType; + return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); +} +#endif // EIGEN_HAS_CXX11 + +#endif // EIGEN_PARSED_BY_DOXYGEN + + +#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN) +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. 
+ * + * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode + * + * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ +template +auto lastN(SizeType size, IncrType incr) +-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) +{ + return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); +} + +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment. + * + * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode + * + * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ +template +auto lastN(SizeType size) +-> decltype(seqN(Eigen::last+fix<1>()-size, size)) +{ + return seqN(Eigen::last+fix<1>()-size, size); +} +#endif + +namespace internal { + +// Convert a symbolic span into a usable one (i.e., remove last/end "keywords") +template +struct make_size_type { + typedef typename internal::conditional::value, Index, T>::type type; +}; + +template +struct IndexedViewCompatibleType, XprSize> { + typedef ArithmeticSequence::type,IncrType> type; +}; + +template +ArithmeticSequence::type,IncrType> +makeIndexedViewCompatible(const ArithmeticSequence& ids, Index size,SpecializedType) { + return ArithmeticSequence::type,IncrType>( + eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject()); +} + +template +struct get_compile_time_incr > { + enum { value = get_fixed_value::value }; +}; + +} // end namespace internal + +/** \namespace Eigen::indexing + * \ingroup Core_Module + * + * The sole purpose of this namespace is to be able to import all functions + * and symbols that are expected to be used within operator() for indexing + * and slicing. If you already imported the whole Eigen namespace: + * \code using namespace Eigen; \endcode + * then you are already all set. 
Otherwise, if you don't want/cannot import + * the whole Eigen namespace, the following line: + * \code using namespace Eigen::indexing; \endcode + * is equivalent to: + * \code + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + using Eigen::lastN; // c++11 only + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; + \endcode + */ +namespace indexing { + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + #if EIGEN_HAS_CXX11 + using Eigen::lastN; + #endif + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; +} + +} // end namespace Eigen + +#endif // EIGEN_ARITHMETIC_SEQUENCE_H diff --git a/externals/eigen/Eigen/src/Core/Array.h b/externals/eigen/Eigen/src/Core/Array.h index 0d34269f..20c789b1 100644 --- a/externals/eigen/Eigen/src/Core/Array.h +++ b/externals/eigen/Eigen/src/Core/Array.h @@ -117,7 +117,7 @@ class Array { return Base::_set(other); } - + /** Default constructor. * * For fixed-size matrices, does nothing. @@ -153,17 +153,54 @@ class Array : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { - other.swap(*this); + Base::operator=(std::move(other)); return *this; } #endif + #if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + * + * Example: \include Array_variadic_ctor_cxx11.cpp + * Output: \verbinclude Array_variadic_ctor_cxx11.out + * + * \sa Array(const std::initializer_list>&) + * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) 
{} + + /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Array_initializer_list_23_cxx11.cpp + * Output: \verbinclude Array_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed. + * Therefore Array{{1,2,3,4,5}} is legal and the more verbose syntax + * Array{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Array_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Array_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, + * and implicit transposition is allowed for compile-time 1D arrays only. + * + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const std::initializer_list>& list) : Base(list) {} + #endif // end EIGEN_HAS_CXX11 + #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -180,6 +217,7 @@ class Array Base::_check_template_params(); this->template _init2(val0, val1); } + #else /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); @@ -191,7 +229,8 @@ class Array */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(Index dim); - /** constructs an initialized 1x1 Array with the given coefficient */ + /** constructs an initialized 1x1 Array with the given coefficient + * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args */ Array(const Scalar& value); /** constructs an uninitialized array with \a rows rows and \a cols columns. * @@ -199,11 +238,14 @@ class Array * it is redundant to pass these parameters, so one should use the default constructor * Array() instead. */ Array(Index rows, Index cols); - /** constructs an initialized 2D vector with given coefficients */ + /** constructs an initialized 2D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ Array(const Scalar& val0, const Scalar& val1); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** constructs an initialized 3D vector with given coefficients */ + /** constructs an initialized 3D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { @@ -213,7 +255,9 @@ class Array m_storage.data()[1] = val1; m_storage.data()[2] = val2; } - /** constructs an initialized 4D vector with given coefficients */ + /** constructs an initialized 4D vector with given coefficients + * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { @@ -231,15 +275,23 @@ class Array : Base(other) { } + private: + struct PrivateType {}; + public: + /** \sa MatrixBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const EigenBase &other) + EIGEN_STRONG_INLINE Array(const EigenBase &other, + typename internal::enable_if::value, + PrivateType>::type = PrivateType()) : Base(other.derived()) { } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT{ return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN @@ -254,7 +306,7 @@ class Array /** \defgroup arraytypedefs Global array typedefs * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common 1D and 2D array types. + * %Eigen defines several typedef shortcuts for most common 1D and 2D array types. * * The general patterns are the following: * @@ -267,6 +319,12 @@ class Array * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is * a fixed-size 1D array of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `ArrayRowsCols` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size. + * - `ArraySize` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays. 
+ * * \sa class Array */ @@ -299,8 +357,42 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS + +#if EIGEN_HAS_CXX11 + +#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##SizeSuffix##SizeSuffix = Array; \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##SizeSuffix = Array; + +#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##Size##X = Array; \ +/** \ingroup arraytypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Array##X##Size = Array; + +EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2) +EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3) +EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4) +EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3) +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4) + +#undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS -#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE +#endif // EIGEN_HAS_CXX11 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ diff --git a/externals/eigen/Eigen/src/Core/ArrayBase.h b/externals/eigen/Eigen/src/Core/ArrayBase.h index f0232f65..ea3dd1c3 100644 --- a/externals/eigen/Eigen/src/Core/ArrayBase.h +++ b/externals/eigen/Eigen/src/Core/ArrayBase.h @@ -69,6 +69,7 @@ template class ArrayBase using Base::coeff; using Base::coeffRef; using Base::lazyAssign; + using Base::operator-; using Base::operator=; using Base::operator+=; using Base::operator-=; @@ -88,7 +89,6 @@ template class ArrayBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/ArrayCwiseUnaryOps.h" # 
include "../plugins/CommonCwiseBinaryOps.h" @@ -153,8 +153,8 @@ template class ArrayBase // inline void evalTo(Dest& dst) const { dst = matrix(); } protected: - EIGEN_DEVICE_FUNC - ArrayBase() : Base() {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase) private: explicit ArrayBase(Index); @@ -175,7 +175,7 @@ template class ArrayBase */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator-=(const ArrayBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -188,7 +188,7 @@ ArrayBase::operator-=(const ArrayBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator+=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -201,7 +201,7 @@ ArrayBase::operator+=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator*=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::mul_assign_op()); @@ -214,7 +214,7 @@ ArrayBase::operator*=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator/=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::div_assign_op()); diff --git a/externals/eigen/Eigen/src/Core/ArrayWrapper.h b/externals/eigen/Eigen/src/Core/ArrayWrapper.h index a04521a1..2e9555b5 100644 --- a/externals/eigen/Eigen/src/Core/ArrayWrapper.h +++ b/externals/eigen/Eigen/src/Core/ArrayWrapper.h @@ -10,7 +10,7 @@ #ifndef EIGEN_ARRAYWRAPPER_H #define EIGEN_ARRAYWRAPPER_H -namespace Eigen { +namespace Eigen { /** \class ArrayWrapper * \ingroup Core_Module @@ -32,7 +32,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & 
~NestByRefBit + LvalueBitFlag = is_lvalue::value ? LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } @@ -59,14 +60,14 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } @@ -89,9 +90,9 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { dst = m_expression; } - const typename internal::remove_all::type& EIGEN_DEVICE_FUNC - nestedExpression() const + const typename internal::remove_all::type& + nestedExpression() const { return m_expression; } @@ -129,7 +130,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? 
LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } @@ -156,14 +158,14 @@ class MatrixWrapper : public MatrixBase > EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } @@ -183,8 +185,8 @@ class MatrixWrapper : public MatrixBase > } EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const + const typename internal::remove_all::type& + nestedExpression() const { return m_expression; } diff --git a/externals/eigen/Eigen/src/Core/Assign.h b/externals/eigen/Eigen/src/Core/Assign.h index 53806ba3..655412ef 100644 --- a/externals/eigen/Eigen/src/Core/Assign.h +++ b/externals/eigen/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/externals/eigen/Eigen/src/Core/AssignEvaluator.h b/externals/eigen/Eigen/src/Core/AssignEvaluator.h index b0ec7b7c..7d76f0c2 100644 --- 
a/externals/eigen/Eigen/src/Core/AssignEvaluator.h +++ b/externals/eigen/Eigen/src/Core/AssignEvaluator.h @@ -17,29 +17,29 @@ namespace Eigen { // This implementation is based on Assign.h namespace internal { - + /*************************************************************************** * Part 1 : the logic deciding a strategy for traversal and unrolling * ***************************************************************************/ // copy_using_evaluator_traits is based on assign_traits -template +template struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; typedef typename Dst::Scalar DstScalar; - + enum { DstFlags = DstEvaluator::Flags, SrcFlags = SrcEvaluator::Flags }; - + public: enum { DstAlignment = DstEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment, - DstHasDirectAccess = DstFlags & DirectAccessBit, + DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) }; @@ -51,13 +51,15 @@ struct copy_using_evaluator_traits InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) : int(DstFlags)&RowMajorBit ? 
int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), + RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize), + RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize), OuterStride = int(outer_stride_at_compile_time::ret), MaxSizeAtCompileTime = Dst::SizeAtCompileTime }; // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet::type LinearPacketType; - typedef typename find_best_packet::type InnerPacketType; + typedef typename find_best_packet::type LinearPacketType; + typedef typename find_best_packet::type InnerPacketType; enum { LinearPacketSize = unpacket_traits::size, @@ -83,7 +85,7 @@ struct copy_using_evaluator_traits && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), - MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ @@ -97,7 +99,8 @@ struct copy_using_evaluator_traits public: enum { - Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + Traversal = int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal) // If compile-size is zero, traversing will fail at compile-time. + : (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? 
int(SliceVectorizedTraversal) @@ -135,7 +138,7 @@ struct copy_using_evaluator_traits ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) - ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) + ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) #if EIGEN_UNALIGNED_VECTORIZE : int(Traversal) == int(SliceVectorizedTraversal) @@ -172,6 +175,8 @@ struct copy_using_evaluator_traits EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime) EIGEN_DEBUG_VAR(UnrollingLimit) EIGEN_DEBUG_VAR(MayUnrollCompletely) EIGEN_DEBUG_VAR(MayUnrollInner) @@ -195,7 +200,7 @@ struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; - + enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime @@ -261,7 +266,7 @@ struct copy_using_evaluator_innervec_CompleteUnrolling typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; - + enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime, @@ -312,6 +317,22 @@ template struct dense_assignment_loop; +/************************ +***** Special Cases ***** +************************/ + +// Zero-sized assignment is a no-op. 
+template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel& /*kernel*/) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + EIGEN_STATIC_ASSERT(int(DstXprType::SizeAtCompileTime) == 0, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT) + } +}; + /************************ *** Default traversal *** ************************/ @@ -426,10 +447,10 @@ struct dense_assignment_loop::size, - alignedSize = (size/packetSize)*packetSize }; + alignedSize = (int(size)/packetSize)*packetSize }; copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); copy_using_evaluator_DefaultTraversal_CompleteUnrolling::run(kernel); @@ -530,7 +551,7 @@ struct dense_assignment_loop const Scalar *dst_ptr = kernel.dstDataPtr(); if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0) { - // the pointer is not aligend-on scalar, so alignment is not possible + // the pointer is not aligned-on scalar, so alignment is not possible return dense_assignment_loop::run(kernel); } const Index packetAlignedMask = packetSize - 1; @@ -568,14 +589,15 @@ struct dense_assignment_loop typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; - enum { size = DstXprType::InnerSizeAtCompileTime, + enum { innerSize = DstXprType::InnerSizeAtCompileTime, packetSize =unpacket_traits::size, - vectorizableSize = (size/packetSize)*packetSize }; + vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize), + size = DstXprType::SizeAtCompileTime }; for(Index outer = 0; outer < kernel.outerSize(); ++outer) { copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); - copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); } } }; @@ -599,73 +621,74 @@ class generic_dense_assignment_kernel typedef typename DstEvaluatorTypeT::XprType DstXprType; typedef typename SrcEvaluatorTypeT::XprType 
SrcXprType; public: - + typedef DstEvaluatorTypeT DstEvaluatorType; typedef SrcEvaluatorTypeT SrcEvaluatorType; typedef typename DstEvaluatorType::Scalar Scalar; typedef copy_using_evaluator_traits AssignmentTraits; typedef typename AssignmentTraits::PacketType PacketType; - - - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) { #ifdef EIGEN_DEBUG_ASSIGN AssignmentTraits::debug(); #endif } - - EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); } - EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); } - EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); } - - EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; } - EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; } - + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_dstExpr.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const EIGEN_NOEXCEPT { return m_dstExpr.innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const EIGEN_NOEXCEPT { return m_dstExpr.outerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_dstExpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_dstExpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerStride() const EIGEN_NOEXCEPT { return m_dstExpr.outerStride(); } + + EIGEN_DEVICE_FUNC DstEvaluatorType& 
dstEvaluator() EIGEN_NOEXCEPT { return m_dst; } + EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const EIGEN_NOEXCEPT { return m_src; } + /// Assign src(row,col) to dst(row,col) through the assignment functor. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col) { m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col)); } - + /// \sa assignCoeff(Index,Index) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index) { m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index)); } - + /// \sa assignCoeff(Index,Index) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) { - Index row = rowIndexByOuterInner(outer, inner); - Index col = colIndexByOuterInner(outer, inner); + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); assignCoeff(row, col); } - - + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { m_functor.template assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col)); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index) { m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); } - + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { - Index row = rowIndexByOuterInner(outer, inner); + Index row = rowIndexByOuterInner(outer, inner); Index col = colIndexByOuterInner(outer, inner); assignPacket(row, col); } - + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; @@ -688,7 +711,7 @@ class generic_dense_assignment_kernel { return m_dstExpr.data(); } - + protected: DstEvaluatorType& m_dst; const SrcEvaluatorType& m_src; @@ -697,6 +720,27 @@ class generic_dense_assignment_kernel DstXprType& m_dstExpr; }; +// Special kernel used when computing 
small products whose operands have dynamic dimensions. It ensures that the +// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used +// when computing the product. + +template +class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel +{ +protected: + typedef generic_dense_assignment_kernel Base; + public: + typedef typename Base::Scalar Scalar; + typedef typename Base::DstXprType DstXprType; + typedef copy_using_evaluator_traits AssignmentTraits; + typedef typename AssignmentTraits::PacketType PacketType; + + EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + : Base(dst, src, func, dstExpr) + { + } + }; + /*************************************************************************** * Part 5 : Entry point for dense rectangular assignment ***************************************************************************/ @@ -734,13 +778,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType resize_if_allowed(dst, src, func); DstEvaluatorType dstEvaluator(dst); - + typedef generic_dense_assignment_kernel Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); dense_assignment_loop::run(kernel); } +// Specialization for filling the destination with a constant value. 
+#ifndef EIGEN_GPU_COMPILE_PHASE +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp, DstXprType>& src, const internal::assign_op& func) +{ + resize_if_allowed(dst, src, func); + std::fill_n(dst.data(), dst.size(), src.functor()()); +} +#endif + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) { @@ -756,13 +810,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. template struct AssignmentKind; -// Assignement kind defined in this file: +// Assignment kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; template struct AssignmentKind { typedef EigenBase2EigenBase Kind; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; - + // This is the main assignment class template< typename DstXprType, typename SrcXprType, typename Functor, typename Kind = typename AssignmentKind< typename evaluator_traits::Shape , typename evaluator_traits::Shape >::Kind, @@ -787,7 +841,7 @@ void call_assignment(const Dst& dst, const Src& src) { call_assignment(dst, src, internal::assign_op()); } - + // Deal with "assume-aliasing" template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -827,14 +881,35 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) typedef typename internal::conditional, Dst>::type ActualDstTypeCleaned; typedef typename internal::conditional, Dst&>::type ActualDstType; ActualDstType actualDst(dst); - + // TODO check whether this is the right place to perform these checks: EIGEN_STATIC_ASSERT_LVALUE(Dst) EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src) EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar); - + Assignment::run(actualDst, src, func); } + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void 
call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +{ + typedef evaluator DstEvaluatorType; + typedef evaluator SrcEvaluatorType; + typedef restricted_packet_dense_assignment_kernel Kernel; + + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar); + + SrcEvaluatorType srcEvaluator(src); + resize_if_allowed(dst, src, func); + + DstEvaluatorType dstEvaluator(dst); + Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); + + dense_assignment_loop::run(kernel); +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias(Dst& dst, const Src& src) @@ -875,7 +950,7 @@ struct Assignment #ifndef EIGEN_NO_DEBUG internal::check_for_aliasing(dst, src); #endif - + call_dense_assignment_loop(dst, src, func); } }; @@ -899,7 +974,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instanciation if not needed + // NOTE The following two functions are templated to avoid their instantiation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. 
template EIGEN_DEVICE_FUNC diff --git a/externals/eigen/Eigen/src/Core/Assign_MKL.h b/externals/eigen/Eigen/src/Core/Assign_MKL.h index 6c2ab926..c6140d18 100644 --- a/externals/eigen/Eigen/src/Core/Assign_MKL.h +++ b/externals/eigen/Eigen/src/Core/Assign_MKL.h @@ -68,27 +68,28 @@ class vml_assign_traits #define EIGEN_PP_EXPAND(ARG) ARG #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1) -#define EIGEN_VMLMODE_EXPAND_LA , VML_HA +#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA #else -#define EIGEN_VMLMODE_EXPAND_LA , VML_LA +#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA #endif -#define EIGEN_VMLMODE_EXPAND__ +#define EIGEN_VMLMODE_EXPAND_x_ -#define EIGEN_VMLMODE_PREFIX_LA vm -#define EIGEN_VMLMODE_PREFIX__ v -#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE) +#define EIGEN_VMLMODE_PREFIX_xLA vm +#define EIGEN_VMLMODE_PREFIX_x_ v +#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE) #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ template< typename DstXprType, typename SrcXprNested> \ struct Assignment, SrcXprNested>, assign_op, \ Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -96,7 +97,7 @@ class vml_assign_traits &(src.nestedExpression().coeffRef(0, outer)); \ 
EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } \ } \ } \ @@ -144,13 +145,14 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseBinaryOp, SrcXprNested, \ const CwiseNullaryOp,Plain> > SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ VMLTYPE exponent = reinterpret_cast(src.rhs().functor().m_other); \ if(vml_assign_traits::Traversal==LinearTraversal) \ { \ VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent, \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -158,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) &(src.lhs().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? 
&(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ } \ } \ } \ diff --git a/externals/eigen/Eigen/src/Core/BandMatrix.h b/externals/eigen/Eigen/src/Core/BandMatrix.h index 4978c914..878c0240 100644 --- a/externals/eigen/Eigen/src/Core/BandMatrix.h +++ b/externals/eigen/Eigen/src/Core/BandMatrix.h @@ -10,7 +10,7 @@ #ifndef EIGEN_BANDMATRIX_H #define EIGEN_BANDMATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -45,7 +45,7 @@ class BandMatrixBase : public EigenBase }; public: - + using Base::derived; using Base::rows; using Base::cols; @@ -55,10 +55,10 @@ class BandMatrixBase : public EigenBase /** \returns the number of sub diagonals */ inline Index subs() const { return derived().subs(); } - + /** \returns an expression of the underlying coefficient matrix */ inline const CoefficientsType& coeffs() const { return derived().coeffs(); } - + /** \returns an expression of the underlying coefficient matrix */ inline CoefficientsType& coeffs() { return derived().coeffs(); } @@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase * \warning the internal storage must be column major. 
*/ inline Block col(Index i) { - EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); Index start = 0; Index len = coeffs().rows(); if (i<=supers()) @@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase template struct DiagonalIntReturnType { enum { - ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)), + ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)), Conjugate = ReturnOpposite && NumTraits::IsComplex, ActualIndex = ReturnOpposite ? -Index : Index, DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) @@ -130,7 +130,7 @@ class BandMatrixBase : public EigenBase eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers())); return Block(coeffs(), supers()-i, std::max(0,i), 1, diagonalLength(i)); } - + template inline void evalTo(Dest& dst) const { dst.resize(rows(),cols()); @@ -192,7 +192,7 @@ struct traits > Options = _Options, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic }; - typedef Matrix CoefficientsType; + typedef Matrix CoefficientsType; }; template @@ -211,16 +211,16 @@ class BandMatrix : public BandMatrixBase @@ -52,7 +52,7 @@ struct traits > : traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit, // FIXME DirectAccessBit should not be handled by expressions - // + // // Alignment is needed by MapBase's assertions // We can sefely set it to false here. 
Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator Alignment = 0 @@ -61,7 +61,7 @@ struct traits > : traits::ret> class BlockImpl_dense; - + } // end namespace internal template class BlockImpl; @@ -109,13 +109,13 @@ template class typedef Impl Base; EIGEN_GENERIC_PUBLIC_INTERFACE(Block) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) - + typedef typename internal::remove_all::type NestedExpression; - + /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index i) : Impl(xpr,i) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index i) : Impl(xpr,i) { eigen_assert( (i>=0) && ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i class /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) @@ -135,8 +135,8 @@ template class /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline Block(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Block(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) @@ -147,7 +147,7 @@ template class && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols); } }; - + // The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense // that must be specialized for direct and non-direct access... 
template @@ -159,10 +159,10 @@ class BlockImpl public: typedef Impl Base; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} - EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} EIGEN_DEVICE_FUNC - inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) {} }; @@ -294,25 +294,25 @@ template::type& nestedExpression() const - { - return m_xpr; + { + return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - - EIGEN_DEVICE_FUNC - StorageIndex startRow() const - { - return m_startRow.value(); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startRow() const EIGEN_NOEXCEPT + { + return m_startRow.value(); } - - EIGEN_DEVICE_FUNC - StorageIndex startCol() const - { - return m_startCol.value(); + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startCol() const EIGEN_NOEXCEPT + { + return m_startCol.value(); } protected: @@ -342,9 +342,9 @@ class BlockImpl_dense /** Column or Row constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index i) + : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) || ((BlockRows==XprType::RowsAtCompileTime) && 
(BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), @@ -357,8 +357,8 @@ class BlockImpl_dense /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { @@ -367,8 +367,8 @@ class BlockImpl_dense /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), @@ -377,18 +377,18 @@ class BlockImpl_dense init(); } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& nestedExpression() const - { - return m_xpr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& nestedExpression() const EIGEN_NOEXCEPT + { + return m_xpr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; } - + /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index innerStride() const EIGEN_NOEXCEPT { return internal::traits::HasSameStorageOrderAsXprType ? 
m_xpr.innerStride() @@ -396,23 +396,19 @@ class BlockImpl_dense } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index outerStride() const EIGEN_NOEXCEPT { - return m_outerStride; + return internal::traits::HasSameStorageOrderAsXprType + ? m_xpr.outerStride() + : m_xpr.innerStride(); } - EIGEN_DEVICE_FUNC - StorageIndex startRow() const - { - return m_startRow.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startRow() const EIGEN_NOEXCEPT { return m_startRow.value(); } - EIGEN_DEVICE_FUNC - StorageIndex startCol() const - { - return m_startCol.value(); - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + StorageIndex startCol() const EIGEN_NOEXCEPT { return m_startCol.value(); } #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend... @@ -422,8 +418,8 @@ class BlockImpl_dense #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal used by allowAligned() */ - EIGEN_DEVICE_FUNC - inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) : Base(data, blockRows, blockCols), m_xpr(xpr) { init(); @@ -431,7 +427,7 @@ class BlockImpl_dense #endif protected: - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void init() { m_outerStride = internal::traits::HasSameStorageOrderAsXprType diff --git a/externals/eigen/Eigen/src/Core/BooleanRedux.h b/externals/eigen/Eigen/src/Core/BooleanRedux.h index 8409d874..852de8b9 100644 --- a/externals/eigen/Eigen/src/Core/BooleanRedux.h +++ b/externals/eigen/Eigen/src/Core/BooleanRedux.h @@ -14,58 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { - typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Traits::RowsAtCompileTime, - row = 
(UnrollCount-1) % Traits::RowsAtCompileTime + col = (UnrollCount-1) / Rows, + row = (UnrollCount-1) % Rows }; - static inline bool run(const Derived &mat) + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(row, col); } }; -template -struct all_unroller +template +struct all_unroller { - static inline bool run(const Derived &/*mat*/) { return true; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { - static inline bool run(const Derived &) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { - typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Traits::RowsAtCompileTime, - row = (UnrollCount-1) % Traits::RowsAtCompileTime + col = (UnrollCount-1) / Rows, + row = (UnrollCount-1) % Rows }; - static inline bool run(const Derived &mat) + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(row, col); } }; -template -struct any_unroller +template +struct any_unroller { - static inline bool run(const Derived & /*mat*/) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { - static inline bool run(const Derived &) { return false; } + EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; } // end namespace internal @@ -78,16 +76,16 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * 
(Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::run(evaluator); + return internal::all_unroller::RowsAtCompileTime>::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -102,16 +100,16 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::run(evaluator); + return internal::any_unroller::RowsAtCompileTime>::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -126,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/externals/eigen/Eigen/src/Core/CommaInitializer.h b/externals/eigen/Eigen/src/Core/CommaInitializer.h index d218e981..c0e29c75 100644 --- a/externals/eigen/Eigen/src/Core/CommaInitializer.h +++ b/externals/eigen/Eigen/src/Core/CommaInitializer.h @@ -33,6 +33,8 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const Scalar& s) : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1) { + eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0 + && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.coeffRef(0,0) = s; } @@ -41,6 +43,8 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const DenseBase& other) : m_xpr(xpr), m_row(0), 
m_col(other.cols()), m_currentBlockRows(other.rows()) { + eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols() + && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.block(0, 0, other.rows(), other.cols()) = other; } @@ -103,7 +107,7 @@ struct CommaInitializer EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception) #endif { - finished(); + finished(); } /** \returns the built matrix once all its coefficients have been set. @@ -141,7 +145,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +153,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/externals/eigen/Eigen/src/Core/ConditionEstimator.h b/externals/eigen/Eigen/src/Core/ConditionEstimator.h index aa7efdc7..51a2e5f1 100644 --- a/externals/eigen/Eigen/src/Core/ConditionEstimator.h +++ b/externals/eigen/Eigen/src/Core/ConditionEstimator.h @@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return RealScalar(1); + if (dec.rows() == 0) return NumTraits::infinity(); if (matrix_norm == RealScalar(0)) return RealScalar(0); if (dec.rows() == 1) return RealScalar(1); const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); diff --git a/externals/eigen/Eigen/src/Core/CoreEvaluators.h b/externals/eigen/Eigen/src/Core/CoreEvaluators.h index f7c1effc..0ff8c8de 100644 --- 
a/externals/eigen/Eigen/src/Core/CoreEvaluators.h +++ b/externals/eigen/Eigen/src/Core/CoreEvaluators.h @@ -14,7 +14,7 @@ #define EIGEN_COREEVALUATORS_H namespace Eigen { - + namespace internal { // This class returns the evaluator kind from the expression storage kind. @@ -63,8 +63,8 @@ template< typename T, template< typename T, typename Kind = typename evaluator_traits::Kind, typename Scalar = typename T::Scalar> struct unary_evaluator; - -// evaluator_traits contains traits for evaluator + +// evaluator_traits contains traits for evaluator template struct evaluator_traits_base @@ -90,7 +90,8 @@ template struct evaluator : public unary_evaluator { typedef unary_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const T& xpr) : Base(xpr) {} }; @@ -99,21 +100,29 @@ template struct evaluator : evaluator { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator(xpr) {} }; // ---------- base class for all evaluators ---------- template -struct evaluator_base : public noncopyable +struct evaluator_base { // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits ExpressionTraits; - + enum { Alignment = 0 }; + // noncopyable: + // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization) + // and make complex evaluator much larger than then should do. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {} +private: + EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&); + EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&); }; // -------------------- Matrix and Array -------------------- @@ -123,6 +132,33 @@ struct evaluator_base : public noncopyable // Here we directly specialize evaluator. 
This is not really a unary expression, and it is, by definition, dense, // so no need for more sophisticated dispatching. +// this helper permits to completely eliminate m_outerStride if it is known at compiletime. +template class plainobjectbase_evaluator_data { +public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + { +#ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(outerStride); +#endif + eigen_internal_assert(outerStride==OuterStride); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index outerStride() const EIGEN_NOEXCEPT { return OuterStride; } + const Scalar *data; +}; + +template class plainobjectbase_evaluator_data { +public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return m_outerStride; } + const Scalar *data; +protected: + Index m_outerStride; +}; + template struct evaluator > : evaluator_base @@ -136,23 +172,28 @@ struct evaluator > IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime, RowsAtCompileTime = PlainObjectType::RowsAtCompileTime, ColsAtCompileTime = PlainObjectType::ColsAtCompileTime, - + CoeffReadCost = NumTraits::ReadCost, Flags = traits::EvaluatorFlags, Alignment = traits::Alignment }; - - EIGEN_DEVICE_FUNC evaluator() - : m_data(0), - m_outerStride(IsVectorAtCompileTime ? 0 - : int(IsRowMajor) ? ColsAtCompileTime - : RowsAtCompileTime) + enum { + // We do not need to know the outer stride for vectors + OuterStrideAtCompileTime = IsVectorAtCompileTime ? 0 + : int(IsRowMajor) ? 
ColsAtCompileTime + : RowsAtCompileTime + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() + : m_d(0,OuterStrideAtCompileTime) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - - EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m) - : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const PlainObjectType& m) + : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -161,30 +202,30 @@ struct evaluator > CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) - return m_data[row * m_outerStride.value() + col]; + return m_d.data[row * m_d.outerStride() + col]; else - return m_data[row + col * m_outerStride.value()]; + return m_d.data[row + col * m_d.outerStride()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_data[index]; + return m_d.data[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) - return const_cast(m_data)[row * m_outerStride.value() + col]; + return const_cast(m_d.data)[row * m_d.outerStride() + col]; else - return const_cast(m_data)[row + col * m_outerStride.value()]; + return const_cast(m_d.data)[row + col * m_d.outerStride()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return const_cast(m_data)[index]; + return const_cast(m_d.data)[index]; } template @@ -192,16 +233,16 @@ struct evaluator > PacketType packet(Index row, Index col) const { if (IsRowMajor) - return ploadt(m_data + row * m_outerStride.value() + col); + return ploadt(m_d.data + row * m_d.outerStride() + col); else - return ploadt(m_data + row + col * m_outerStride.value()); + return ploadt(m_d.data + row + col * m_d.outerStride()); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return ploadt(m_data + index); + return ploadt(m_d.data + index); } 
template @@ -210,26 +251,22 @@ struct evaluator > { if (IsRowMajor) return pstoret - (const_cast(m_data) + row * m_outerStride.value() + col, x); + (const_cast(m_d.data) + row * m_d.outerStride() + col, x); else return pstoret - (const_cast(m_data) + row + col * m_outerStride.value(), x); + (const_cast(m_d.data) + row + col * m_d.outerStride(), x); } template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return pstoret(const_cast(m_data) + index, x); + return pstoret(const_cast(m_d.data) + index, x); } protected: - const Scalar *m_data; - // We do not need to know the outer stride for vectors - variable_if_dynamic m_outerStride; + plainobjectbase_evaluator_data m_d; }; template @@ -237,11 +274,13 @@ struct evaluator > : evaluator > > { typedef Matrix XprType; - - EIGEN_DEVICE_FUNC evaluator() {} - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) - : evaluator >(m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) + : evaluator >(m) { } }; @@ -251,10 +290,12 @@ struct evaluator > { typedef Array XprType; - EIGEN_DEVICE_FUNC evaluator() {} - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) - : evaluator >(m) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + evaluator() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& m) + : evaluator >(m) { } }; @@ -265,14 +306,15 @@ struct unary_evaluator, IndexBased> : evaluator_base > { typedef Transpose XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost, + CoeffReadCost = evaluator::CoeffReadCost, Flags = evaluator::Flags ^ RowMajorBit, Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} typedef typename XprType::Scalar Scalar; typedef typename 
XprType::CoeffReturnType CoeffReturnType; @@ -457,10 +499,10 @@ struct evaluator > { typedef CwiseNullaryOp XprType; typedef typename internal::remove_all::type PlainObjectTypeCleaned; - + enum { CoeffReadCost = internal::functor_traits::Cost, - + Flags = (evaluator::Flags & ( HereditaryBits | (functor_has_linear_access::ret ? LinearAccessBit : 0) @@ -517,19 +559,17 @@ struct unary_evaluator, IndexBased > : evaluator_base > { typedef CwiseUnaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = evaluator::Flags & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? PacketAccessBit : 0)), Alignment = evaluator::Alignment }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& op) - : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + explicit unary_evaluator(const XprType& op) : m_d(op) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -540,32 +580,43 @@ struct unary_evaluator, IndexBased > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_argImpl.coeff(row, col)); + return m_d.func()(m_d.argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_argImpl.coeff(index)); + return m_d.func()(m_d.argImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_argImpl.template packet(row, col)); + return m_d.func().packetOp(m_d.argImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_argImpl.template packet(index)); + return m_d.func().packetOp(m_d.argImpl.template packet(index)); } protected: - const UnaryOp m_functor; - evaluator m_argImpl; + + // this helper 
permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& func() const { return op; } + UnaryOp op; + evaluator argImpl; + }; + + Data m_d; }; // -------------------- CwiseTernaryOp -------------------- @@ -577,7 +628,7 @@ struct evaluator > { typedef CwiseTernaryOp XprType; typedef ternary_evaluator > Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} }; @@ -586,10 +637,10 @@ struct ternary_evaluator, IndexBased : evaluator_base > { typedef CwiseTernaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Arg1Flags = evaluator::Flags, Arg2Flags = evaluator::Flags, Arg3Flags = evaluator::Flags, @@ -609,11 +660,7 @@ struct ternary_evaluator, IndexBased evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) - : m_functor(xpr.functor()), - m_arg1Impl(xpr.arg1()), - m_arg2Impl(xpr.arg2()), - m_arg3Impl(xpr.arg3()) + EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -624,38 +671,48 @@ struct ternary_evaluator, IndexBased EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col)); + return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_arg1Impl.coeff(index), 
m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_arg1Impl.template packet(row, col), - m_arg2Impl.template packet(row, col), - m_arg3Impl.template packet(row, col)); + return m_d.func().packetOp(m_d.arg1Impl.template packet(row, col), + m_d.arg2Impl.template packet(row, col), + m_d.arg3Impl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_arg1Impl.template packet(index), - m_arg2Impl.template packet(index), - m_arg3Impl.template packet(index)); + return m_d.func().packetOp(m_d.arg1Impl.template packet(index), + m_d.arg2Impl.template packet(index), + m_d.arg3Impl.template packet(index)); } protected: - const TernaryOp m_functor; - evaluator m_arg1Impl; - evaluator m_arg2Impl; - evaluator m_arg3Impl; + // this helper permits to completely eliminate the functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TernaryOp& func() const { return op; } + TernaryOp op; + evaluator arg1Impl; + evaluator arg2Impl; + evaluator arg3Impl; + }; + + Data m_d; }; // -------------------- CwiseBinaryOp -------------------- @@ -667,8 +724,9 @@ struct evaluator > { typedef CwiseBinaryOp XprType; typedef binary_evaluator > Base; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& xpr) : Base(xpr) {} }; template @@ -676,10 +734,10 @@ struct binary_evaluator, IndexBased, IndexBase : evaluator_base > { typedef CwiseBinaryOp XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + 
functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + LhsFlags = evaluator::Flags, RhsFlags = evaluator::Flags, SameType = is_same::value, @@ -696,10 +754,8 @@ struct binary_evaluator, IndexBased, IndexBase Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) - : m_functor(xpr.functor()), - m_lhsImpl(xpr.lhs()), - m_rhsImpl(xpr.rhs()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit binary_evaluator(const XprType& xpr) : m_d(xpr) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -710,35 +766,46 @@ struct binary_evaluator, IndexBased, IndexBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); + return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); + return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_functor.packetOp(m_lhsImpl.template packet(row, col), - m_rhsImpl.template packet(row, col)); + return m_d.func().packetOp(m_d.lhsImpl.template packet(row, col), + m_d.rhsImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_functor.packetOp(m_lhsImpl.template packet(index), - m_rhsImpl.template packet(index)); + return m_d.func().packetOp(m_d.lhsImpl.template packet(index), + m_d.rhsImpl.template packet(index)); } protected: - const BinaryOp m_functor; - evaluator m_lhsImpl; - evaluator m_rhsImpl; + + // this helper permits to completely eliminate the functor if it is empty + struct Data + { 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const BinaryOp& func() const { return op; } + BinaryOp op; + evaluator lhsImpl; + evaluator rhsImpl; + }; + + Data m_d; }; // -------------------- CwiseUnaryView -------------------- @@ -748,18 +815,16 @@ struct unary_evaluator, IndexBased> : evaluator_base > { typedef CwiseUnaryView XprType; - + enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, - + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), - + Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost... }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) - : m_unaryOp(op.functor()), - m_argImpl(op.nestedExpression()) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -771,30 +836,41 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_unaryOp(m_argImpl.coeff(row, col)); + return m_d.func()(m_d.argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_unaryOp(m_argImpl.coeff(index)); + return m_d.func()(m_d.argImpl.coeff(index)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { - return m_unaryOp(m_argImpl.coeffRef(row, col)); + return m_d.func()(m_d.argImpl.coeffRef(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return m_unaryOp(m_argImpl.coeffRef(index)); + return m_d.func()(m_d.argImpl.coeffRef(index)); } protected: - const UnaryOp m_unaryOp; - evaluator m_argImpl; + + // this helper permits to completely eliminate the 
functor if it is empty + struct Data + { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& func() const { return op; } + UnaryOp op; + evaluator argImpl; + }; + + Data m_d; }; // -------------------- Map -------------------- @@ -811,14 +887,15 @@ struct mapbase_evaluator : evaluator_base typedef typename XprType::PointerType PointerType; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - + enum { IsRowMajor = XprType::RowsAtCompileTime, ColsAtCompileTime = XprType::ColsAtCompileTime, CoeffReadCost = NumTraits::ReadCost }; - EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit mapbase_evaluator(const XprType& map) : m_data(const_cast(map.data())), m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) @@ -882,17 +959,21 @@ struct mapbase_evaluator : evaluator_base internal::pstoret(m_data + index * m_innerStride.value(), x); } protected: - EIGEN_DEVICE_FUNC - inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } - EIGEN_DEVICE_FUNC - inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowStride() const EIGEN_NOEXCEPT { + return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colStride() const EIGEN_NOEXCEPT { + return XprType::IsRowMajor ? 
m_innerStride.value() : m_outerStride.value(); + } PointerType m_data; const internal::variable_if_dynamic m_innerStride; const internal::variable_if_dynamic m_outerStride; }; -template +template struct evaluator > : public mapbase_evaluator, PlainObjectType> { @@ -900,7 +981,7 @@ struct evaluator > typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types typedef typename packet_traits::type PacketScalar; - + enum { InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) @@ -912,34 +993,35 @@ struct evaluator > HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0, HasNoStride = HasNoInnerStride && HasNoOuterStride, IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, - + PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit), LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit), Flags = int( evaluator::Flags) & (LinearAccessMask&PacketAccessMask), - + Alignment = int(MapOptions)&int(AlignedMask) }; EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) - : mapbase_evaluator(map) + : mapbase_evaluator(map) { } }; // -------------------- Ref -------------------- -template +template struct evaluator > : public mapbase_evaluator, PlainObjectType> { typedef Ref XprType; - + enum { Flags = evaluator >::Flags, Alignment = evaluator >::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref) - : mapbase_evaluator(ref) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& ref) + : mapbase_evaluator(ref) { } }; @@ -947,8 +1029,8 @@ struct evaluator > template::ret> struct block_evaluator; - -template + +template struct evaluator > : block_evaluator { @@ -956,15 +1038,15 @@ struct evaluator > typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle 
multi-sized packet types typedef typename packet_traits::type PacketScalar; - + enum { CoeffReadCost = evaluator::CoeffReadCost, - + RowsAtCompileTime = traits::RowsAtCompileTime, ColsAtCompileTime = traits::ColsAtCompileTime, MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = traits::MaxColsAtCompileTime, - + ArgTypeIsRowMajor = (int(evaluator::Flags)&RowMajorBit) != 0, IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1 : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0 @@ -977,21 +1059,24 @@ struct evaluator > OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), - MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - - FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, + MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, + + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | DirectAccessBit | MaskPacketAccessBit), Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, - + PacketAlignment = unpacket_traits::alignment, - Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, + Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) + && (OuterStrideAtCompileTime!=0) + && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? 
int(PacketAlignment) : 0, Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& block) : block_evaluator_type(block) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1004,8 +1089,9 @@ struct block_evaluator XprType; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) - : unary_evaluator(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) + : unary_evaluator(block) {} }; @@ -1015,84 +1101,116 @@ struct unary_evaluator, IndexBa { typedef Block XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) - : m_argImpl(block.nestedExpression()), - m_startRow(block.startRow()), - m_startCol(block.startCol()) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& block) + : m_argImpl(block.nestedExpression()), + m_startRow(block.startRow()), + m_startCol(block.startCol()), + m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? 
block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0) { } - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { - RowsAtCompileTime = XprType::RowsAtCompileTime + RowsAtCompileTime = XprType::RowsAtCompileTime, + ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator::Flags&LinearAccessBit) }; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const - { - return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); + { + return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + { + return linear_coeff_impl(index, bool_constant()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) - { - return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); + { + return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + { + return linear_coeffRef_impl(index, bool_constant()); } - + template EIGEN_STRONG_INLINE - PacketType packet(Index row, Index col) const - { - return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); + PacketType packet(Index row, Index col) const + { + return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); } template EIGEN_STRONG_INLINE - PacketType packet(Index index) const - { - return packet(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? 
index : 0); + PacketType packet(Index index) const + { + if (ForwardLinearAccess) + return m_argImpl.template packet(m_linear_offset.value() + index); + else + return packet(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); } - + template EIGEN_STRONG_INLINE - void writePacket(Index row, Index col, const PacketType& x) + void writePacket(Index row, Index col, const PacketType& x) { - return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); + return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); } - + template EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketType& x) + void writePacket(Index index, const PacketType& x) { - return writePacket(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0, - x); + if (ForwardLinearAccess) + return m_argImpl.template writePacket(m_linear_offset.value() + index, x); + else + return writePacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, + x); } - + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const + { + return m_argImpl.coeff(m_linear_offset.value() + index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const + { + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */) + { + return m_argImpl.coeffRef(m_linear_offset.value() + index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */) + { + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? 
index : 0); + } + evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; + const variable_if_dynamic m_linear_offset; }; -// TODO: This evaluator does not actually use the child evaluator; +// TODO: This evaluator does not actually use the child evaluator; // all action is via the data() as returned by the Block expression. -template +template struct block_evaluator : mapbase_evaluator, typename Block::PlainObject> @@ -1100,8 +1218,9 @@ struct block_evaluator XprType; typedef typename XprType::Scalar Scalar; - EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) - : mapbase_evaluator(block) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit block_evaluator(const XprType& block) + : mapbase_evaluator(block) { // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); @@ -1124,18 +1243,19 @@ struct evaluator > evaluator::CoeffReadCost), Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits, - + Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - + typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1155,7 +1275,7 @@ struct evaluator > else return m_elseImpl.coeff(index); } - + protected: evaluator m_conditionImpl; evaluator m_thenImpl; @@ -1165,7 +1285,7 @@ struct evaluator > // -------------------- Replicate -------------------- -template +template struct unary_evaluator > : evaluator_base > { @@ -1176,22 +1296,23 @@ struct 
unary_evaluator > }; typedef typename internal::nested_eval::type ArgTypeNested; typedef typename internal::remove_all::type ArgTypeNestedCleaned; - + enum { CoeffReadCost = evaluator::CoeffReadCost, LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0, Flags = (evaluator::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits::Flags & RowMajorBit), - + Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& replicate) : m_arg(replicate.nestedExpression()), m_argImpl(m_arg), m_rows(replicate.nestedExpression().rows()), m_cols(replicate.nestedExpression().cols()) {} - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { @@ -1202,10 +1323,10 @@ struct unary_evaluator > const Index actual_col = internal::traits::ColsAtCompileTime==1 ? 0 : ColFactor==1 ? col : col % m_cols.value(); - + return m_argImpl.coeff(actual_row, actual_col); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { @@ -1213,7 +1334,7 @@ struct unary_evaluator > const Index actual_index = internal::traits::RowsAtCompileTime==1 ? (ColFactor==1 ? index : index%m_cols.value()) : (RowFactor==1 ? 
index : index%m_rows.value()); - + return m_argImpl.coeff(actual_index); } @@ -1230,7 +1351,7 @@ struct unary_evaluator > return m_argImpl.template packet(actual_row, actual_col); } - + template EIGEN_STRONG_INLINE PacketType packet(Index index) const @@ -1241,7 +1362,7 @@ struct unary_evaluator > return m_argImpl.template packet(actual_index); } - + protected: const ArgTypeNested m_arg; evaluator m_argImpl; @@ -1249,64 +1370,6 @@ struct unary_evaluator > const variable_if_dynamic m_cols; }; - -// -------------------- PartialReduxExpr -------------------- - -template< typename ArgType, typename MemberOp, int Direction> -struct evaluator > - : evaluator_base > -{ - typedef PartialReduxExpr XprType; - typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; - typedef typename ArgType::Scalar InputScalar; - typedef typename XprType::Scalar Scalar; - enum { - TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) - }; - typedef typename MemberOp::template Cost CostOpType; - enum { - CoeffReadCost = TraversalSize==Dynamic ? HugeCost - : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - - Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, - - Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized - }; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) - : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? 
HugeCost : int(CostOpType::value)); - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index i, Index j) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(j)); - else - return m_functor(m_arg.row(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index index) const - { - if (Direction==Vertical) - return m_functor(m_arg.col(index)); - else - return m_functor(m_arg.row(index)); - } - -protected: - typename internal::add_const_on_value_type::type m_arg; - const MemberOp m_functor; -}; - - // -------------------- MatrixWrapper and ArrayWrapper -------------------- // // evaluator_wrapper_base is a common base class for the @@ -1323,7 +1386,8 @@ struct evaluator_wrapper_base Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; @@ -1390,7 +1454,8 @@ struct unary_evaluator > { typedef MatrixWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1401,7 +1466,8 @@ struct unary_evaluator > { typedef ArrayWrapper XprType; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1428,9 +1494,9 @@ struct unary_evaluator > ReversePacket = (Direction == BothDirections) || ((Direction == Vertical) && IsColMajor) || ((Direction == Horizontal) && IsRowMajor), - + CoeffReadCost = 
evaluator::CoeffReadCost, - + // let's enable LinearAccess only with vectorization because of the product overhead // FIXME enable DirectAccess with negative strides? Flags0 = evaluator::Flags, @@ -1439,16 +1505,17 @@ struct unary_evaluator > ? LinearAccessBit : 0, Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess), - + Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f. }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& reverse) : m_argImpl(reverse.nestedExpression()), m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1), m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { @@ -1523,7 +1590,7 @@ struct unary_evaluator > m_argImpl.template writePacket (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); } - + protected: evaluator m_argImpl; @@ -1541,20 +1608,21 @@ struct evaluator > : evaluator_base > { typedef Diagonal XprType; - + enum { CoeffReadCost = evaluator::CoeffReadCost, - + Flags = (unsigned int)(evaluator::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit, - + Alignment = 0 }; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit evaluator(const XprType& diagonal) : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) { } - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -1587,8 +1655,10 @@ struct evaluator > const internal::variable_if_dynamicindex m_index; private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? 
m_index.value() : 0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; } }; @@ -1612,25 +1682,25 @@ class EvalToTemp : public dense_xpr_base >::type { public: - + typedef typename dense_xpr_base::type Base; EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp) - + explicit EvalToTemp(const ArgType& arg) : m_arg(arg) { } - + const ArgType& arg() const { return m_arg; } - Index rows() const + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_arg.rows(); } - Index cols() const + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_arg.cols(); } @@ -1638,7 +1708,7 @@ class EvalToTemp private: const ArgType& m_arg; }; - + template struct evaluator > : public evaluator @@ -1646,7 +1716,7 @@ struct evaluator > typedef EvalToTemp XprType; typedef typename ArgType::PlainObject PlainObject; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) { diff --git a/externals/eigen/Eigen/src/Core/CoreIterators.h b/externals/eigen/Eigen/src/Core/CoreIterators.h index 4eb42b93..b9671968 100644 --- a/externals/eigen/Eigen/src/Core/CoreIterators.h +++ b/externals/eigen/Eigen/src/Core/CoreIterators.h @@ -48,6 +48,11 @@ class InnerIterator * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView */ EIGEN_STRONG_INLINE InnerIterator& operator++() { m_iter.operator++(); return *this; } + EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; } + EIGEN_STRONG_INLINE InnerIterator operator+(Index i) + { InnerIterator result(*this); result+=i; return result; } + + /// \returns the column or row index of the current coefficient. EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); } /// \returns the row index of the current coefficient. 
diff --git a/externals/eigen/Eigen/src/Core/CwiseBinaryOp.h b/externals/eigen/Eigen/src/Core/CwiseBinaryOp.h index a36765e3..2202b1cc 100644 --- a/externals/eigen/Eigen/src/Core/CwiseBinaryOp.h +++ b/externals/eigen/Eigen/src/Core/CwiseBinaryOp.h @@ -74,7 +74,7 @@ class CwiseBinaryOpImpl; * \sa MatrixBase::binaryExpr(const MatrixBase &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp */ template -class CwiseBinaryOp : +class CwiseBinaryOp : public CwiseBinaryOpImpl< BinaryOp, LhsType, RhsType, typename internal::cwise_promote_storage_type::StorageKind, @@ -83,7 +83,7 @@ class CwiseBinaryOp : internal::no_assignment_operator { public: - + typedef typename internal::remove_all::type Functor; typedef typename internal::remove_all::type Lhs; typedef typename internal::remove_all::type Rhs; @@ -100,8 +100,14 @@ class CwiseBinaryOp : typedef typename internal::remove_reference::type _LhsNested; typedef typename internal::remove_reference::type _RhsNested; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) +#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 + //Required for Visual Studio or the Copy constructor will probably not get inlined! 
+ EIGEN_STRONG_INLINE + CwiseBinaryOp(const CwiseBinaryOp&) = default; +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) { EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); @@ -110,31 +116,25 @@ class CwiseBinaryOp : eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - if (internal::traits::type>::RowsAtCompileTime==Dynamic) - return m_rhs.rows(); - else - return m_lhs.rows(); + return internal::traits::type>::RowsAtCompileTime==Dynamic ? m_rhs.rows() : m_lhs.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { // return the fixed size type if available to enable compile time optimizations - if (internal::traits::type>::ColsAtCompileTime==Dynamic) - return m_rhs.cols(); - else - return m_lhs.cols(); + return internal::traits::type>::ColsAtCompileTime==Dynamic ? 
m_rhs.cols() : m_lhs.cols(); } /** \returns the left hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; } /** \returns the right hand side nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; } /** \returns the functor representing the binary operation */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; } protected: @@ -158,7 +158,7 @@ class CwiseBinaryOpImpl */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - diff --git a/externals/eigen/Eigen/src/Core/CwiseNullaryOp.h b/externals/eigen/Eigen/src/Core/CwiseNullaryOp.h index dd498f75..289ec510 100644 --- a/externals/eigen/Eigen/src/Core/CwiseNullaryOp.h +++ b/externals/eigen/Eigen/src/Core/CwiseNullaryOp.h @@ -74,10 +74,10 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -126,12 +131,17 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f * * 
Here is an example with C++11 random generators: \include random_cpp11.cpp * Output: \verbinclude random_cpp11.out - * + * * \sa class CwiseNullaryOp */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -150,7 +160,12 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const CwiseNullaryOp::PlainObject> +#else +const CwiseNullaryOp +#endif DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -170,7 +185,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); @@ -192,7 +207,7 @@ DenseBase::Constant(Index rows, Index cols, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index size, const Scalar& value) { return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); @@ -208,7 +223,7 @@ DenseBase::Constant(Index size, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType 
DenseBase::Constant(const Scalar& value) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -217,27 +232,32 @@ DenseBase::Constant(const Scalar& value) /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&) * - * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) + * \only_for_vectors + * + * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp + * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out + * + * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** \deprecated because of accuracy loss. 
In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&) * - * \sa LinSpaced(Scalar,Scalar) + * \sa LinSpaced(const Scalar&, const Scalar&) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** @@ -264,11 +284,11 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** @@ -276,17 +296,17 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) * Special version for fixed size types which does not require the size parameter. 
*/ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template -bool DenseBase::isApproxToConstant +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); @@ -301,7 +321,7 @@ bool DenseBase::isApproxToConstant * * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ template -bool DenseBase::isConstant +EIGEN_DEVICE_FUNC bool DenseBase::isConstant (const Scalar& val, const RealScalar& prec) const { return isApproxToConstant(val, prec); @@ -312,7 +332,7 @@ bool DenseBase::isConstant * \sa setConstant(), Constant(), class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) { setConstant(val); } @@ -322,7 +342,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) { return derived() = Constant(rows(), cols(), val); } 
@@ -337,7 +357,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index size, const Scalar& val) { resize(size); @@ -356,13 +376,40 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) { resize(rows, cols); return setConstant(val); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to the given value \a val. For the parameter + * of type NoChange_t, just pass the special value \c NoChange. + * + * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(NoChange_t, Index cols, const Scalar& val) +{ + return setConstant(rows(), cols, val); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to the given value \a val. For the parameter + * of type NoChange_t, just pass the special value \c NoChange. + * + * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(Index rows, NoChange_t, const Scalar& val) +{ + return setConstant(rows, cols(), val); +} + + /** * \brief Sets a linearly spaced vector. 
* @@ -380,10 +427,10 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); + return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); } /** @@ -400,7 +447,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, con * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return setLinSpaced(size(), low, high); @@ -423,7 +470,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, * \sa Zero(), Zero(Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index rows, Index cols) { return Constant(rows, cols, Scalar(0)); @@ -446,7 +493,7 @@ DenseBase::Zero(Index rows, Index cols) * \sa Zero(), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index size) { return Constant(size, Scalar(0)); @@ -463,7 +510,7 @@ DenseBase::Zero(Index size) * \sa Zero(Index), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero() { return Constant(Scalar(0)); @@ -478,7 +525,7 @@ DenseBase::Zero() * \sa class CwiseNullaryOp, Zero() */ template -bool DenseBase::isZero(const RealScalar& prec) const +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); for(Index j = 0; j < cols(); ++j) @@ -496,7 +543,7 @@ bool DenseBase::isZero(const RealScalar& prec) const * \sa class CwiseNullaryOp, Zero() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() { return setConstant(Scalar(0)); } @@ -511,7 +558,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setZero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index newSize) { resize(newSize); @@ -529,13 +576,39 @@ PlainObjectBase::setZero(Index newSize) * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index rows, Index cols) { resize(rows, cols); return setConstant(Scalar(0)); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to zero. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Zero() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(NoChange_t, Index cols) +{ + return setZero(rows(), cols); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to zero. 
For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Zero() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(Index rows, NoChange_t) +{ + return setZero(rows, cols()); +} + // ones: /** \returns an expression of a matrix where all coefficients equal one. @@ -553,7 +626,7 @@ PlainObjectBase::setZero(Index rows, Index cols) * \sa Ones(), Ones(Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index rows, Index cols) { return Constant(rows, cols, Scalar(1)); @@ -576,7 +649,7 @@ DenseBase::Ones(Index rows, Index cols) * \sa Ones(), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index newSize) { return Constant(newSize, Scalar(1)); @@ -593,7 +666,7 @@ DenseBase::Ones(Index newSize) * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones() { return Constant(Scalar(1)); @@ -608,7 +681,7 @@ DenseBase::Ones() * \sa class CwiseNullaryOp, Ones() */ template -bool DenseBase::isOnes +EIGEN_DEVICE_FUNC bool DenseBase::isOnes (const RealScalar& prec) const { return isApproxToConstant(Scalar(1), prec); @@ -622,7 +695,7 @@ bool DenseBase::isOnes * \sa class CwiseNullaryOp, Ones() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() { return setConstant(Scalar(1)); } @@ -637,7 +710,7 @@ EIGEN_STRONG_INLINE 
Derived& DenseBase::setOnes() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index newSize) { resize(newSize); @@ -655,13 +728,39 @@ PlainObjectBase::setOnes(Index newSize) * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index rows, Index cols) { resize(rows, cols); return setConstant(Scalar(1)); } +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to one. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp, MatrixBase::Ones() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(Index rows, NoChange_t) +{ + return setOnes(rows, cols()); +} + +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to one. For the parameter of type NoChange_t, + * just pass the special value \c NoChange. + * + * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp, MatrixBase::Ones() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(NoChange_t, Index cols) +{ + return setOnes(rows(), cols); +} + // Identity: /** \returns an expression of the identity matrix (not necessarily square). 
@@ -679,7 +778,7 @@ PlainObjectBase::setOnes(Index rows, Index cols) * \sa Identity(), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity(Index rows, Index cols) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); @@ -696,7 +795,7 @@ MatrixBase::Identity(Index rows, Index cols) * \sa Identity(Index,Index), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity() { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -771,7 +870,7 @@ struct setIdentity_impl * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() { return internal::setIdentity_impl::run(derived()); } @@ -787,7 +886,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) { derived().resize(rows, cols); return setIdentity(); @@ -800,7 +899,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return 
BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); @@ -815,7 +914,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(),i); @@ -828,7 +927,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() { return Derived::Unit(0); } /** \returns an expression of the Y axis unit vector (0,1{,0}^*) @@ -838,7 +937,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() { return Derived::Unit(1); } /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) @@ -848,7 +947,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType 
MatrixBase::UnitZ() { return Derived::Unit(2); } /** \returns an expression of the W axis unit vector (0,0,0,1) @@ -858,9 +957,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } +/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector + * + * \param i index of the unique coefficient to be set to 1 + * + * \only_for_vectors + * + * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i @@ -24,7 +24,7 @@ struct traits > typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; enum { - Flags = _XprTypeNested::Flags & RowMajorBit + Flags = _XprTypeNested::Flags & RowMajorBit }; }; } @@ -65,10 +65,10 @@ class CwiseUnaryOp : public CwiseUnaryOpImpl::non_const_type MatrixTypeNested; typedef typename internal::remove_all::type NestedExpression; - explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) + explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp()) : m_matrix(mat), m_functor(func) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView) - EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); } - EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + 
Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \returns the functor representing unary operation */ - const ViewOp& functor() const { return m_functor; } + EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; } /** \returns the nested expression */ - const typename internal::remove_all::type& + EIGEN_DEVICE_FUNC const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - typename internal::remove_reference::type& - nestedExpression() { return m_matrix.const_cast_derived(); } + EIGEN_DEVICE_FUNC typename internal::remove_reference::type& + nestedExpression() { return m_matrix; } protected: MatrixTypeNested m_matrix; @@ -108,19 +110,21 @@ class CwiseUnaryViewImpl EIGEN_DENSE_PUBLIC_INTERFACE(Derived) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl) - + EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); } EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); } - EIGEN_DEVICE_FUNC inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return derived().nestedExpression().innerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } - EIGEN_DEVICE_FUNC inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl) }; } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/DenseBase.h b/externals/eigen/Eigen/src/Core/DenseBase.h index 46fe5193..9b16db68 100644 --- a/externals/eigen/Eigen/src/Core/DenseBase.h +++ b/externals/eigen/Eigen/src/Core/DenseBase.h @@ -14,15 +14,15 @@ namespace Eigen { 
namespace internal { - + // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type. // This dummy function simply aims at checking that at compile time. static inline void check_DenseIndex_is_signed() { - EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); + EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE) } } // end namespace internal - + /** \class DenseBase * \ingroup Core_Module * @@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() { */ template class DenseBase #ifndef EIGEN_PARSED_BY_DOXYGEN - : public DenseCoeffsBase + : public DenseCoeffsBase::value> #else : public DenseCoeffsBase #endif // not EIGEN_PARSED_BY_DOXYGEN @@ -64,14 +64,14 @@ template class DenseBase /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex, etc. */ typedef typename internal::traits::Scalar Scalar; - + /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex, etc. * * It is an alias for the Scalar type */ typedef Scalar value_type; - + typedef typename NumTraits::Real RealScalar; - typedef DenseCoeffsBase Base; + typedef DenseCoeffsBase::value> Base; using Base::derived; using Base::const_cast_derived; @@ -150,13 +150,18 @@ template class DenseBase * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime */ - IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1, + IsVectorAtCompileTime = internal::traits::RowsAtCompileTime == 1 + || internal::traits::ColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of * columns is known at compile-time to be equal to 1. Indeed, in that case, * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 
1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". @@ -170,11 +175,11 @@ template class DenseBase InnerStrideAtCompileTime = internal::inner_stride_at_compile_time::ret, OuterStrideAtCompileTime = internal::outer_stride_at_compile_time::ret }; - + typedef typename internal::find_best_packet::type PacketScalar; enum { IsPlainObjectBase = 0 }; - + /** The plain matrix type corresponding to this expression. * \sa PlainObject */ typedef Matrix::Scalar, @@ -184,7 +189,7 @@ template class DenseBase internal::traits::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime > PlainMatrix; - + /** The plain array type corresponding to this expression. * \sa PlainObject */ typedef Array::Scalar, @@ -206,7 +211,7 @@ template class DenseBase /** \returns the number of nonzero coefficients which is in practice the number * of stored coefficients. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index nonZeros() const { return size(); } /** \returns the outer size. @@ -214,7 +219,7 @@ template class DenseBase * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a * column-major matrix, and the number of rows for a row-major matrix. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index outerSize() const { return IsVectorAtCompileTime ? 1 @@ -224,9 +229,9 @@ template class DenseBase /** \returns the inner size. * * \note For a vector, this is just the size. 
For a matrix (non-vector), this is the minor dimension - * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a + * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a * column-major matrix, and the number of columns for a row-major matrix. */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index innerSize() const { return IsVectorAtCompileTime ? this->size() @@ -261,9 +266,9 @@ template class DenseBase /** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */ - typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; + EIGEN_DEPRECATED typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ - typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; @@ -296,18 +301,18 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue& func); - /** \ínternal - * Copies \a other into *this without evaluating other. \returns a reference to *this. - * \deprecated */ + /** \internal + * Copies \a other into *this without evaluating other. \returns a reference to *this. 
*/ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC Derived& lazyAssign(const DenseBase& other); EIGEN_DEVICE_FUNC CommaInitializer operator<< (const Scalar& s); - /** \deprecated it now returns \c *this */ template + /** \deprecated it now returns \c *this */ EIGEN_DEPRECATED const Derived& flagged() const { return derived(); } @@ -332,12 +337,13 @@ template class DenseBase EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value); - EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high); + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType + LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low, const Scalar& high); - EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType - LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(const Scalar& low, const Scalar& high); @@ -369,7 +375,7 @@ template class DenseBase template EIGEN_DEVICE_FUNC bool isApprox(const DenseBase& other, const RealScalar& prec = NumTraits::dummy_precision()) const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const RealScalar& other, const RealScalar& prec = NumTraits::dummy_precision()) const; template EIGEN_DEVICE_FUNC @@ -380,7 +386,7 @@ template class DenseBase EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits::dummy_precision()) const; - + inline bool hasNaN() const; inline bool allFinite() const; @@ 
-394,8 +400,8 @@ template class DenseBase * * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. - * - * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. + * + * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const @@ -405,12 +411,12 @@ template class DenseBase // size types on MSVC. return typename internal::eval::type(derived()); } - + /** swaps *this with the expression \a other. * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(const DenseBase& other) { EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); @@ -422,7 +428,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(PlainObjectBase& other) { eigen_assert(rows()==other.rows() && cols()==other.cols()); @@ -443,18 +449,58 @@ template class DenseBase EIGEN_DEVICE_FUNC Scalar prod() const; + template EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff() const; + template EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff() const; - template EIGEN_DEVICE_FUNC + + // By default, the fastest version with undefined NaN propagation semantics is + // used. + // TODO(rmlarsen): Replace with default template argument when we move to + // c++11 or beyond. 
+ EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar minCoeff() const { + return minCoeff(); + } + EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar maxCoeff() const { + return maxCoeff(); + } + + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff(IndexType* index) const; - template EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff(IndexType* index) const; + // TODO(rmlarsen): Replace these methods with a default template argument. + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const { + return minCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const { + return maxCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar minCoeff(IndexType* index) const { + return minCoeff(index); + } + template + EIGEN_DEVICE_FUNC inline + typename internal::traits::Scalar maxCoeff(IndexType* index) const { + return maxCoeff(index); + } + template EIGEN_DEVICE_FUNC Scalar redux(const BinaryOp& func) const; @@ -484,16 +530,16 @@ template class DenseBase return derived().coeff(0,0); } - bool all() const; - bool any() const; - Index count() const; + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; typedef VectorwiseOp RowwiseReturnType; typedef const VectorwiseOp ConstRowwiseReturnType; typedef VectorwiseOp ColwiseReturnType; typedef const VectorwiseOp ConstColwiseReturnType; - /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations + 
/** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions * * Example: \include MatrixBase_rowwise.cpp * Output: \verbinclude MatrixBase_rowwise.out @@ -506,7 +552,7 @@ template class DenseBase } EIGEN_DEVICE_FUNC RowwiseReturnType rowwise(); - /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations + /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out @@ -524,16 +570,16 @@ template class DenseBase static const RandomReturnType Random(); template - const Select + inline EIGEN_DEVICE_FUNC const Select select(const DenseBase& thenMatrix, const DenseBase& elseMatrix) const; template - inline const Select + inline EIGEN_DEVICE_FUNC const Select select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const; template - inline const Select + inline EIGEN_DEVICE_FUNC const Select select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const; template RealScalar lpNorm() const; @@ -567,16 +613,59 @@ template class DenseBase } EIGEN_DEVICE_FUNC void reverseInPlace(); + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** STL-like RandomAccessIterator + * iterator type as returned by the begin() and end() methods. + */ + typedef random_access_iterator_type iterator; + /** This is the const version of iterator (aka read-only) */ + typedef random_access_iterator_type const_iterator; + #else + typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + >::type iterator_type; + + typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, + internal::pointer_based_stl_iterator, + internal::generic_randaccess_stl_iterator + >::type const_iterator_type; + + // Stl-style iterators are supported only for vectors. 
+ + typedef typename internal::conditional< IsVectorAtCompileTime, + iterator_type, + void + >::type iterator; + + typedef typename internal::conditional< IsVectorAtCompileTime, + const_iterator_type, + void + >::type const_iterator; + #endif + + inline iterator begin(); + inline const_iterator begin() const; + inline const_iterator cbegin() const; + inline iterator end(); + inline const_iterator end() const; + inline const_iterator cend() const; + #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) +#define EIGEN_DOC_UNARY_ADDONS(X,Y) +# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/BlockMethods.h" +# include "../plugins/IndexedViewMethods.h" +# include "../plugins/ReshapedMethods.h" # ifdef EIGEN_DENSEBASE_PLUGIN # include EIGEN_DENSEBASE_PLUGIN # endif #undef EIGEN_CURRENT_STORAGE_BASE_CLASS #undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF +#undef EIGEN_DOC_UNARY_ADDONS // disable the use of evalTo for dense objects with a nice compilation error template @@ -587,11 +676,12 @@ template class DenseBase } protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase) /** Default constructor. Do nothing. */ EIGEN_DEVICE_FUNC DenseBase() { /* Just checks for self-consistency of the flags. 
- * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down + * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down */ #ifdef EIGEN_INTERNAL_DEBUGGING EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) diff --git a/externals/eigen/Eigen/src/Core/DenseCoeffsBase.h b/externals/eigen/Eigen/src/Core/DenseCoeffsBase.h index c4af48ab..37fcdb59 100644 --- a/externals/eigen/Eigen/src/Core/DenseCoeffsBase.h +++ b/externals/eigen/Eigen/src/Core/DenseCoeffsBase.h @@ -22,11 +22,12 @@ template struct add_const_on_value_type_if_arithmetic /** \brief Base class providing read-only coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * \tparam #ReadOnlyAccessors Constant indicating read-only access + * + * \note #ReadOnlyAccessors Constant indicating read-only access * * This class defines the \c operator() \c const function and friends, which can be used to read specific * entries of a matrix or array. - * + * * \sa DenseCoeffsBase, DenseCoeffsBase, * \ref TopicClassHierarchy */ @@ -288,12 +289,13 @@ class DenseCoeffsBase : public EigenBase /** \brief Base class providing read/write coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * \tparam #WriteAccessors Constant indicating read/write access + * + * \note #WriteAccessors Constant indicating read/write access * * This class defines the non-const \c operator() function and friends, which can be used to write specific * entries of a matrix or array. This class inherits DenseCoeffsBase which * defines the const variant for reading specific entries. 
- * + * * \sa DenseCoeffsBase, \ref TopicClassHierarchy */ template @@ -466,7 +468,8 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read-only using @@ -492,7 +495,7 @@ class DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read/write using @@ -566,8 +570,8 @@ class DenseCoeffsBase * * \sa outerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } @@ -577,14 +581,14 @@ class DenseCoeffsBase * * \sa innerStride(), rowStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } // FIXME shall we remove it ? - inline Index stride() const + EIGEN_CONSTEXPR inline Index stride() const EIGEN_NOEXCEPT { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); } @@ -593,8 +597,8 @@ class DenseCoeffsBase * * \sa innerStride(), outerStride(), colStride() */ - EIGEN_DEVICE_FUNC - inline Index rowStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rowStride() const EIGEN_NOEXCEPT { return Derived::IsRowMajor ? outerStride() : innerStride(); } @@ -603,8 +607,8 @@ class DenseCoeffsBase * * \sa innerStride(), outerStride(), rowStride() */ - EIGEN_DEVICE_FUNC - inline Index colStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index colStride() const EIGEN_NOEXCEPT { return Derived::IsRowMajor ? 
innerStride() : outerStride(); } @@ -615,7 +619,7 @@ namespace internal { template struct first_aligned_impl { - static inline Index run(const Derived&) + static EIGEN_CONSTEXPR inline Index run(const Derived&) EIGEN_NOEXCEPT { return 0; } }; diff --git a/externals/eigen/Eigen/src/Core/DenseStorage.h b/externals/eigen/Eigen/src/Core/DenseStorage.h index 7958feeb..08ef6c53 100644 --- a/externals/eigen/Eigen/src/Core/DenseStorage.h +++ b/externals/eigen/Eigen/src/Core/DenseStorage.h @@ -47,21 +47,21 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array() - { + { check_static_allocation_size(); } EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) - { + { check_static_allocation_size(); } }; #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) -#elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. +#elif EIGEN_GNUC_AT_LEAST(4,7) + // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. 
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template @@ -85,15 +85,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() + plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -104,15 +104,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() - { + plain_array() + { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -123,15 +123,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() + plain_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -142,15 +142,15 @@ struct plain_array EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; EIGEN_DEVICE_FUNC - plain_array() - { + plain_array() + { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } EIGEN_DEVICE_FUNC - plain_array(constructor_without_unaligned_array_assert) - { + plain_array(constructor_without_unaligned_array_assert) + { check_static_allocation_size(); } }; @@ -163,6 +163,30 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; +struct plain_array_helper { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void copy(const 
plain_array& src, const Eigen::Index size, + plain_array& dst) { + smart_copy(src.array, src.array + size, dst.array); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void swap(plain_array& a, const Eigen::Index a_size, + plain_array& b, const Eigen::Index b_size) { + if (a_size < b_size) { + std::swap_ranges(b.array, b.array + a_size, a.array); + smart_move(b.array + a_size, b.array + b_size, a.array + a_size); + } else if (a_size > b_size) { + std::swap_ranges(a.array, a.array + b_size, b.array); + smart_move(a.array + b_size, a.array + a_size, b.array + b_size); + } else { + std::swap_ranges(a.array, a.array + a_size, b.array); + } + } +}; + } // end namespace internal /** \internal @@ -190,16 +214,41 @@ template class DenseSt EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} - EIGEN_DEVICE_FUNC +#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } - EIGEN_DEVICE_FUNC +#else + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; +#endif +#if !EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) - { + { if (this != &other) m_data = other.m_data; - return *this; + return *this; + } +#else + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; +#endif +#if EIGEN_HAS_RVALUE_REFERENCES +#if !EIGEN_HAS_CXX11 + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT + : m_data(std::move(other.m_data)) + { } + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT + { + if (this != &other) + m_data = std::move(other.m_data); + return *this; + } +#else + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default; +#endif 
+#endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); @@ -207,9 +256,11 @@ template class DenseSt EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } - EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} - EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data, other.m_data); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index cols(void) EIGEN_NOEXCEPT {return _Cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {} EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } @@ -226,8 +277,8 @@ template class DenseStorage class DenseStorage class DenseStorage class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) - { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} - EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} void conservativeResize(Index size, Index rows, Index cols) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*m_cols); @@ -404,7 +482,7 @@ template class DenseStorage(m_data, m_rows*m_cols); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = 
internal::conditional_aligned_new_auto(size); else m_data = 0; @@ -446,7 +524,7 @@ template class DenseStorageswap(tmp); } return *this; - } + } #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT @@ -459,16 +537,18 @@ template class DenseStorage(m_data, _Rows*m_cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } - EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} - EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_cols,other.m_cols); + } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR Index rows(void) EIGEN_NOEXCEPT {return _Rows;} + EIGEN_DEVICE_FUNC Index cols(void) const EIGEN_NOEXCEPT {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, _Rows*m_cols); @@ -479,7 +559,7 @@ template class DenseStorage(m_data, _Rows*m_cols); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; @@ -520,7 +600,7 @@ template class DenseStorageswap(tmp); } return *this; - } + } #if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT @@ -533,16 +613,18 @@ template class DenseStorage(m_data, _Cols*m_rows); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } - EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} - EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + } + EIGEN_DEVICE_FUNC Index rows(void) const EIGEN_NOEXCEPT {return m_rows;} + EIGEN_DEVICE_FUNC static 
EIGEN_CONSTEXPR Index cols(void) {return _Cols;} void conservativeResize(Index size, Index rows, Index) { m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*_Cols); @@ -553,7 +635,7 @@ template class DenseStorage(m_data, _Cols*m_rows); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; diff --git a/externals/eigen/Eigen/src/Core/Diagonal.h b/externals/eigen/Eigen/src/Core/Diagonal.h index 49e71125..3112d2c1 100644 --- a/externals/eigen/Eigen/src/Core/Diagonal.h +++ b/externals/eigen/Eigen/src/Core/Diagonal.h @@ -11,7 +11,7 @@ #ifndef EIGEN_DIAGONAL_H #define EIGEN_DIAGONAL_H -namespace Eigen { +namespace Eigen { /** \class Diagonal * \ingroup Core_Module @@ -70,7 +70,10 @@ template class Diagonal EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DEVICE_FUNC - explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} + explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) + { + eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() ); + } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) @@ -81,20 +84,16 @@ template class Diagonal : numext::mini(m_matrix.rows(),m_matrix.cols()-m_index.value()); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return 1; } - EIGEN_DEVICE_FUNC - inline Index innerStride() const - { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride() + 1; } - EIGEN_DEVICE_FUNC - inline Index outerStride() const - { - return 0; - } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return 0; } typedef typename internal::conditional< internal::is_lvalue::value, @@ -146,8 +145,8 @@ template class 
Diagonal } EIGEN_DEVICE_FUNC - inline const typename internal::remove_all::type& - nestedExpression() const + inline const typename internal::remove_all::type& + nestedExpression() const { return m_matrix; } @@ -164,12 +163,12 @@ template class Diagonal private: // some compilers may fail to optimize std::max etc in case of compile-time constants... - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index absDiagIndex() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rowOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index colOffset() const EIGEN_NOEXCEPT { return m_index.value()>0 ? m_index.value() : 0; } // trigger a compile-time error if someone try to call packet template typename MatrixType::PacketReturnType packet(Index) const; template typename MatrixType::PacketReturnType packet(Index,Index) const; @@ -184,7 +183,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -192,7 +191,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -210,7 +209,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -218,7 +217,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -237,6 +236,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -246,6 +246,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/externals/eigen/Eigen/src/Core/DiagonalMatrix.h b/externals/eigen/Eigen/src/Core/DiagonalMatrix.h index ecfdce8e..542685c6 100644 --- a/externals/eigen/Eigen/src/Core/DiagonalMatrix.h +++ b/externals/eigen/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -83,6 +83,30 @@ class DiagonalBase : public EigenBase { return DiagonalWrapper(scalar * other.diagonal()); } + + template + EIGEN_DEVICE_FUNC + #ifdef EIGEN_PARSED_BY_DOXYGEN + inline unspecified_expression_type + #else + inline const DiagonalWrapper + #endif + operator+(const DiagonalBase& other) const + { + return (diagonal() + other.diagonal()).asDiagonal(); + } + + template + EIGEN_DEVICE_FUNC + #ifdef EIGEN_PARSED_BY_DOXYGEN + inline unspecified_expression_type + #else + inline const DiagonalWrapper + #endif + operator-(const DiagonalBase& other) const + { + return (diagonal() - other.diagonal()).asDiagonal(); + } }; #endif @@ -154,6 +178,30 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {} + #if EIGEN_HAS_CXX11 + /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11 + * + * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients. + * + * \warning To construct a diagonal matrix of fixed size, the number of values passed to this + * constructor must match the fixed dimension of \c *this. 
+ * + * \sa DiagonalMatrix(const Scalar&, const Scalar&) + * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args) + : m_diagonal(a0, a1, a2, args...) {} + + /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list>& list) + : m_diagonal(list) {} + #endif // EIGEN_HAS_CXX11 + /** Copy constructor. */ template EIGEN_DEVICE_FUNC @@ -273,7 +321,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/externals/eigen/Eigen/src/Core/DiagonalProduct.h b/externals/eigen/Eigen/src/Core/DiagonalProduct.h index d372b938..7911d1cd 100644 --- a/externals/eigen/Eigen/src/Core/DiagonalProduct.h +++ b/externals/eigen/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); diff --git a/externals/eigen/Eigen/src/Core/Dot.h b/externals/eigen/Eigen/src/Core/Dot.h index 06ef18b8..5c3441b9 100644 --- a/externals/eigen/Eigen/src/Core/Dot.h +++ b/externals/eigen/Eigen/src/Core/Dot.h @@ -31,7 +31,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.template 
binaryExpr(b).sum(); } @@ -43,7 +44,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.transpose().template binaryExpr(b).sum(); } @@ -65,6 +67,7 @@ struct dot_nocheck template template EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType MatrixBase::dot(const MatrixBase& other) const { @@ -83,14 +86,14 @@ MatrixBase::dot(const MatrixBase& other) const //---------- implementation of L2 norm and related functions ---------- -/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm. +/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm. * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. 
* * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +120,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +163,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -204,7 +207,7 @@ struct lpNorm_selector EIGEN_DEVICE_FUNC static inline 
RealScalar run(const MatrixBase& m) { - EIGEN_USING_STD_MATH(pow) + EIGEN_USING_STD(pow) return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p); } }; @@ -257,9 +260,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { diff --git a/externals/eigen/Eigen/src/Core/EigenBase.h b/externals/eigen/Eigen/src/Core/EigenBase.h index f76995af..6b3c7d37 100644 --- a/externals/eigen/Eigen/src/Core/EigenBase.h +++ b/externals/eigen/Eigen/src/Core/EigenBase.h @@ -14,7 +14,8 @@ namespace Eigen { /** \class EigenBase - * + * \ingroup Core_Module + * * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * * In other words, an EigenBase object is an object that can be copied into a MatrixBase. @@ -28,11 +29,12 @@ namespace Eigen { template struct EigenBase { // typedef typename internal::plain_matrix_type::type PlainObject; - + /** \brief The interface type of indices * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. - * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. * \sa StorageIndex, \ref TopicPreprocessorDirectives. + * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. + * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute. */ typedef Eigen::Index Index; @@ -54,15 +56,15 @@ template struct EigenBase { return *static_cast(this); } /** \returns the number of rows. 
\sa cols(), RowsAtCompileTime */ - EIGEN_DEVICE_FUNC - inline Index rows() const { return derived().rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/ - EIGEN_DEVICE_FUNC - inline Index cols() const { return derived().cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } /** \returns the number of coefficients, which is rows()*cols(). * \sa rows(), cols(), SizeAtCompileTime. */ - EIGEN_DEVICE_FUNC - inline Index size() const { return rows() * cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index size() const EIGEN_NOEXCEPT { return rows() * cols(); } /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */ template @@ -128,6 +130,7 @@ template struct EigenBase */ template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const EigenBase &other) { call_assignment(derived(), other.derived()); @@ -136,6 +139,7 @@ Derived& DenseBase::operator=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator+=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -144,6 +148,7 @@ Derived& DenseBase::operator+=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator-=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); diff --git a/externals/eigen/Eigen/src/Core/ForceAlignedAccess.h b/externals/eigen/Eigen/src/Core/ForceAlignedAccess.h index 7b08b45e..817a43af 100644 --- a/externals/eigen/Eigen/src/Core/ForceAlignedAccess.h +++ b/externals/eigen/Eigen/src/Core/ForceAlignedAccess.h @@ -41,10 +41,14 @@ template class ForceAlignedAccess EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { 
return m_expression.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_expression.innerStride(); } EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const { diff --git a/externals/eigen/Eigen/src/Core/Fuzzy.h b/externals/eigen/Eigen/src/Core/Fuzzy.h index 3e403a09..43aa49b2 100644 --- a/externals/eigen/Eigen/src/Core/Fuzzy.h +++ b/externals/eigen/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/externals/eigen/Eigen/src/Core/GeneralProduct.h b/externals/eigen/Eigen/src/Core/GeneralProduct.h index 0f16cd8e..6906aa75 100644 --- a/externals/eigen/Eigen/src/Core/GeneralProduct.h +++ 
b/externals/eigen/Eigen/src/Core/GeneralProduct.h @@ -18,18 +18,33 @@ enum { Small = 3 }; +// Define the threshold value to fallback from the generic matrix-matrix product +// implementation (heavy) to the lightweight coeff-based product one. +// See generic_product_impl +// in products/GeneralMatrixMatrix.h for more details. +// TODO This threshold should also be used in the compile-time selector below. +#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD +// This default value has been obtained on a Haswell architecture. +#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20 +#endif + namespace internal { template struct product_type_selector; template struct product_size_category { - enum { is_large = MaxSize == Dynamic || - Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || - (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), - value = is_large ? Large - : Size == 1 ? 1 - : Small + enum { + #ifndef EIGEN_GPU_COMPILE_PHASE + is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + #else + is_large = 0, + #endif + value = is_large ? Large + : Size == 1 ? 
1 + : Small }; }; @@ -148,13 +163,13 @@ template struct gemv_static_vect template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } }; template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; } }; template @@ -213,8 +228,7 @@ template<> struct gemv_dense_selector ActualLhsType actualLhs = LhsBlasTraits::extract(lhs); ActualRhsType actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); // make sure Dest is a compile-time vector type (bug 1166) typedef typename conditional::type ActualDest; @@ -224,7 +238,7 @@ template<> struct gemv_dense_selector // on, the other hand it is good for the cache to pack the vector anyways... 
EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), - MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal + MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0) }; typedef const_blas_data_mapper LhsMapper; @@ -305,13 +319,12 @@ template<> struct gemv_dense_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs); enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 // on, the other hand it is good for the cache to pack the vector anyways... - DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0 }; gemv_static_vector_if static_rhs; @@ -379,11 +392,10 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef __CUDACC__ - template template -inline const Product +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const Product MatrixBase::operator*(const MatrixBase &other) const { // A note regarding the function declaration: In MSVC, this function will sometimes @@ -412,8 +424,6 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // __CUDACC__ - /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. 
* * The returned product will behave like any other expressions: the coefficients of the product will be @@ -427,6 +437,7 @@ MatrixBase::operator*(const MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product MatrixBase::lazyProduct(const MatrixBase &other) const { diff --git a/externals/eigen/Eigen/src/Core/GenericPacketMath.h b/externals/eigen/Eigen/src/Core/GenericPacketMath.h index 27033a2d..cf677a19 100644 --- a/externals/eigen/Eigen/src/Core/GenericPacketMath.h +++ b/externals/eigen/Eigen/src/Core/GenericPacketMath.h @@ -44,23 +44,29 @@ struct default_packet_traits enum { HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, HasSetLinear = 1, - HasBlend = 0, + HasBlend = 0, + // This flag is used to indicate whether packet comparison is supported. + // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. 
+ HasCmp = 0, HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, + HasExpm1 = 0, HasLog = 0, HasLog1p = 0, HasLog10 = 0, @@ -81,14 +87,18 @@ struct default_packet_traits HasPolygamma = 0, HasErf = 0, HasErfc = 0, + HasNdtri = 0, + HasBessel = 0, HasIGamma = 0, + HasIGammaDerA = 0, + HasGammaSampleDerAlpha = 0, HasIGammac = 0, HasBetaInc = 0, HasRound = 0, + HasRint = 0, HasFloor = 0, HasCeil = 0, - HasSign = 0 }; }; @@ -119,6 +129,22 @@ template struct packet_traits : default_packet_traits template struct packet_traits : packet_traits { }; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1, + vectorizable = false, + masked_load_available=false, + masked_store_available=false + }; +}; + +template struct unpacket_traits : unpacket_traits { }; + template struct type_casting_traits { enum { VectorizedCast = 0, @@ -127,6 +153,34 @@ template struct type_casting_traits { }; }; +/** \internal Wrapper to ensure that multiple packet types can map to the same + same underlying vector type. */ +template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; + + +/** \internal A convenience utility for determining if the type is a scalar. + * This is used to enable some generic packet implementations. 
+ */ +template +struct is_scalar { + typedef typename unpacket_traits::type Scalar; + enum { + value = internal::is_same::value + }; +}; /** \internal \returns static_cast(a) (coeff-wise) */ template @@ -139,75 +193,406 @@ EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/) { return static_cast(a); } - template EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/) { return static_cast(a); } +template +EIGEN_DEVICE_FUNC inline TgtPacket +pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const SrcPacket& /*d*/, + const SrcPacket& /*e*/, const SrcPacket& /*f*/, const SrcPacket& /*g*/, const SrcPacket& /*h*/) { + return static_cast(a); +} + +/** \internal \returns reinterpret_cast(a) */ +template +EIGEN_DEVICE_FUNC inline Target +preinterpret(const Packet& a); /* { return reinterpret_cast(a); } */ /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, - const Packet& b) { return a+b; } +padd(const Packet& a, const Packet& b) { return a+b; } +// Avoid compiler warning for boolean algebra. 
+template<> EIGEN_DEVICE_FUNC inline bool +padd(const bool& a, const bool& b) { return a || b; } /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, - const Packet& b) { return a-b; } +psub(const Packet& a, const Packet& b) { return a-b; } /** \internal \returns -a (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pnegate(const Packet& a) { return -a; } -/** \internal \returns conj(a) (coeff-wise) */ +template<> EIGEN_DEVICE_FUNC inline bool +pnegate(const bool& a) { return !a; } +/** \internal \returns conj(a) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, - const Packet& b) { return a*b; } +pmul(const Packet& a, const Packet& b) { return a*b; } +// Avoid compiler warning for boolean algebra. +template<> EIGEN_DEVICE_FUNC inline bool +pmul(const bool& a, const bool& b) { return a && b; } /** \internal \returns a / b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, - const Packet& b) { return a/b; } +pdiv(const Packet& a, const Packet& b) { return a/b; } + +// In the generic case, memset to all one bits. +template +struct ptrue_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){ + Packet b; + memset(static_cast(&b), 0xff, sizeof(Packet)); + return b; + } +}; -/** \internal \returns the min of \a a and \a b (coeff-wise) */ +// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value). +// Although this is technically not a valid bitmask, the scalar path for pselect +// uses a comparison to zero, so this should still work in most cases. We don't +// have another option, since the scalar type requires initialization. 
+template +struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ + return T(1); + } +}; + +/** \internal \returns one bits. */ template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, - const Packet& b) { return numext::mini(a, b); } +ptrue(const Packet& a) { + return ptrue_impl::run(a); +} + +// In the general case, memset to zero. +template +struct pzero_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) { + Packet b; + memset(static_cast(&b), 0x00, sizeof(Packet)); + return b; + } +}; -/** \internal \returns the max of \a a and \a b (coeff-wise) */ +// For scalars, explicitly set to Scalar(0), since the underlying representation +// for zero may not consist of all-zero bits. +template +struct pzero_impl::value>::type> { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { + return T(0); + } +}; + +/** \internal \returns packet of zeros */ template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, - const Packet& b) { return numext::maxi(a, b); } +pzero(const Packet& a) { + return pzero_impl::run(a); +} -/** \internal \returns the absolute value of \a a */ +/** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -pabs(const Packet& a) { using std::abs; return abs(a); } +pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } -/** \internal \returns the phase angle of \a a */ +/** \internal \returns a < b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet -parg(const Packet& a) { using numext::arg; return arg(a); } +pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet +pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } + +/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ +template EIGEN_DEVICE_FUNC inline Packet +pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? 
pzero(a) : ptrue(a); } + +template +struct bit_and { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a & b; + } +}; + +template +struct bit_or { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a | b; + } +}; + +template +struct bit_xor { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { + return a ^ b; + } +}; + +template +struct bit_not { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +// Use operators &, |, ^, ~. +template +struct operator_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not()(a); } +}; + +// Apply binary operations byte-by-byte +template +struct bytewise_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { + return binary(a, b, bit_and()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + return binary(a, b, bit_or()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { + return binary(a, b, bit_xor()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + return unary(a,bit_not()); + } + + private: + template + EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++); + } + return c; + } + + template + EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) { + const unsigned 
char* a_ptr = reinterpret_cast(&a); + const unsigned char* b_ptr = reinterpret_cast(&b); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++, *b_ptr++); + } + return c; + } +}; + +// In the general case, use byte-by-byte manipulation. +template +struct bitwise_helper : public bytewise_bitwise_helper {}; + +// For integers or non-trivial scalars, use binary operators. +template +struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + > : public operator_bitwise_helper {}; /** \internal \returns the bitwise and of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -pand(const Packet& a, const Packet& b) { return a & b; } +pand(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_and(a, b); +} /** \internal \returns the bitwise or of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -por(const Packet& a, const Packet& b) { return a | b; } +por(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_or(a, b); +} /** \internal \returns the bitwise xor of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -pxor(const Packet& a, const Packet& b) { return a ^ b; } +pxor(const Packet& a, const Packet& b) { + return bitwise_helper::bitwise_xor(a, b); +} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + return bitwise_helper::bitwise_not(a); +} -/** \internal \returns the bitwise andnot of \a a and \a b */ +/** \internal \returns the bitwise and of \a a and not \a b */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (!b); } +pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); } + +// In the general case, use bitwise select. 
+template +struct pselect_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); + } +}; + +// For scalars, use ternary select. +template +struct pselect_impl::value>::type > { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return numext::equal_strict(mask, Packet(0)) ? b : a; + } +}; + +/** \internal \returns \a or \b for each field in packet according to \mask */ +template EIGEN_DEVICE_FUNC inline Packet +pselect(const Packet& mask, const Packet& a, const Packet& b) { + return pselect_impl::run(mask, a, b); +} + +template<> EIGEN_DEVICE_FUNC inline bool pselect( + const bool& cond, const bool& a, const bool& b) { + return cond ? a : b; +} + +/** \internal \returns the min or of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, the result is implementation defined. */ +template +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + return op(a,b); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If either \a a or \a b are NaN, NaN is returned. */ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), b), + a); + } +}; + +/** \internal \returns the min or max of \a a and \a b (coeff-wise) + If both \a a and \a b are NaN, NaN is returned. + Equivalent to std::fmin(a, b). 
*/ +template<> +struct pminmax_impl { + template + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) { + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet not_nan_mask_b = pcmp_eq(b, b); + return pselect(not_nan_mask_a, + pselect(not_nan_mask_b, op(a, b), a), + b); + } +}; + + +#ifndef SYCL_DEVICE_ONLY +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) Func +#else +#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) \ +[](const Type& a, const Type& b) { \ + return Func(a, b);} +#endif + +/** \internal \returns the min of \a a and \a b (coeff-wise). + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmin(const Packet& a, const Packet& b) { return numext::mini(a,b); } + +/** \internal \returns the min of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. */ +template +EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) { + return pminmax_impl::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin))); +} + +/** \internal \returns the max of \a a and \a b (coeff-wise) + If \a a or \b b is NaN, the return value is implementation defined. */ +template EIGEN_DEVICE_FUNC inline Packet +pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } + +/** \internal \returns the max of \a a and \a b (coeff-wise). + NaNPropagation determines the NaN propagation semantics. 
*/ +template +EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) { + return pminmax_impl::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet,(pmax))); +} + +/** \internal \returns the absolute value of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pabs(const Packet& a) { return numext::abs(a); } +template<> EIGEN_DEVICE_FUNC inline unsigned int +pabs(const unsigned int& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long +pabs(const unsigned long& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline unsigned long long +pabs(const unsigned long long& a) { return a; } + +/** \internal \returns the addsub value of \a a,b */ +template EIGEN_DEVICE_FUNC inline Packet +paddsub(const Packet& a, const Packet& b) { + return pselect(peven_mask(a), padd(a, b), psub(a, b)); + } + +/** \internal \returns the phase angle of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +parg(const Packet& a) { using numext::arg; return arg(a); } + + +/** \internal \returns \a a logically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +parithmetic_shift_right(const int& a) { return a >> N; } +template EIGEN_DEVICE_FUNC inline long int +parithmetic_shift_right(const long int& a) { return a >> N; } + +/** \internal \returns \a a arithmetically shifted by N bits to the right */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_right(const long int& a) { return static_cast(static_cast(a) >> N); } + +/** \internal \returns \a a shifted by N bits to the left */ +template EIGEN_DEVICE_FUNC inline int +plogical_shift_left(const int& a) { return a << N; } +template EIGEN_DEVICE_FUNC inline long int +plogical_shift_left(const long int& a) { return a << N; } + +/** \internal \returns the significant and exponent of the underlying floating point numbers + * See 
https://en.cppreference.com/w/cpp/numeric/math/frexp + */ +template +EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { + int exp; + EIGEN_USING_STD(frexp); + Packet result = static_cast(frexp(a, &exp)); + exponent = static_cast(exp); + return result; +} + +/** \internal \returns a * 2^((int)exponent) + * See https://en.cppreference.com/w/cpp/numeric/math/ldexp + */ +template EIGEN_DEVICE_FUNC inline Packet +pldexp(const Packet &a, const Packet &exponent) { + EIGEN_USING_STD(ldexp) + return static_cast(ldexp(a, static_cast(exponent))); +} + +/** \internal \returns the min of \a a and \a b (coeff-wise) */ +template EIGEN_DEVICE_FUNC inline Packet +pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -217,10 +602,22 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } +/** \internal \returns a packet version of \a *from, (un-aligned masked load) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. 
+ */ +template EIGEN_DEVICE_FUNC inline +typename enable_if::masked_load_available, Packet>::type +ploadu(const typename unpacket_traits::type* from, typename unpacket_traits::mask_t umask); + /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients set from bits */ +template EIGEN_DEVICE_FUNC inline Packet +pset1frombits(BitsType a); + /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } @@ -230,14 +627,14 @@ pload1(const typename unpacket_traits::type *a) { return pset1( * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. */ -template EIGEN_DEVICE_FUNC inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits::type* from) { return *from; } /** \internal \returns a packet with elements of \a *from quadrupled. * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]} * Currently, this function is only used in matrix products. - * For packet-size smaller or equal to 4, this function is equivalent to pload1 + * For packet-size smaller or equal to 4, this function is equivalent to pload1 */ template EIGEN_DEVICE_FUNC inline Packet ploadquad(const typename unpacket_traits::type* from) @@ -278,9 +675,23 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). 
*/ -template inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0), + where x is the value of all 1-bits. */ +template EIGEN_DEVICE_FUNC inline Packet +peven_mask(const Packet& /*a*/) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n]; + for(size_t i = 0; i < n; ++i) { + memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar)); + } + return ploadu(elements); +} + + /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) { (*to) = from; } @@ -289,6 +700,15 @@ template EIGEN_DEVICE_FUNC inline void pstore( template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) { (*to) = from; } +/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask) + * There is no generic implementation. We only have implementations for specialized + * cases. Generic case should not be called. 
+ */ +template +EIGEN_DEVICE_FUNC inline +typename enable_if::masked_store_available, void>::type +pstoreu(Scalar* to, const Packet& from, typename unpacket_traits::mask_t umask); + template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) { return ploadu(from); } @@ -298,8 +718,10 @@ template EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef __CUDA_ARCH__ -#if defined(__LP64__) +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) +#if defined(__LP64__) || EIGEN_OS_WIN64 // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); #else @@ -311,39 +733,6 @@ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a #endif } -/** \internal \returns the first element of a packet */ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Packet& a) -{ return a; } - -/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ -template EIGEN_DEVICE_FUNC inline Packet -preduxp(const Packet* vecs) { return vecs[0]; } - -/** \internal \returns the sum of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) -{ return a; } - -/** \internal \returns the sum of the elements of \a a by block of 4 elements. - * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} - * For packet-size smaller or equal to 4, this boils down to a noop. 
- */ -template EIGEN_DEVICE_FUNC inline -typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_downto4(const Packet& a) -{ return a; } - -/** \internal \returns the product of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) -{ return a; } - -/** \internal \returns the min of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) -{ return a; } - -/** \internal \returns the max of the elements of \a a*/ -template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) -{ return a; } - /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } @@ -351,10 +740,7 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { - // FIXME: uncomment the following in case we drop the internal imag and real functions. 
-// using std::imag; -// using std::real; - return Packet(imag(a),real(a)); + return Packet(numext::imag(a),numext::real(a)); } /************************** @@ -363,47 +749,51 @@ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet /** \internal \returns the sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psin(const Packet& a) { using std::sin; return sin(a); } +Packet psin(const Packet& a) { EIGEN_USING_STD(sin); return sin(a); } /** \internal \returns the cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcos(const Packet& a) { using std::cos; return cos(a); } +Packet pcos(const Packet& a) { EIGEN_USING_STD(cos); return cos(a); } /** \internal \returns the tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptan(const Packet& a) { using std::tan; return tan(a); } +Packet ptan(const Packet& a) { EIGEN_USING_STD(tan); return tan(a); } /** \internal \returns the arc sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pasin(const Packet& a) { using std::asin; return asin(a); } +Packet pasin(const Packet& a) { EIGEN_USING_STD(asin); return asin(a); } /** \internal \returns the arc cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pacos(const Packet& a) { using std::acos; return acos(a); } +Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); } /** \internal \returns the arc tangent of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet patan(const Packet& a) { using std::atan; return atan(a); } +Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); } /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psinh(const Packet& a) { using std::sinh; return sinh(a); } 
+Packet psinh(const Packet& a) { EIGEN_USING_STD(sinh); return sinh(a); } /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); } +Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); } /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); } +Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); } /** \internal \returns the exp of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pexp(const Packet& a) { using std::exp; return exp(a); } +Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); } + +/** \internal \returns the expm1 of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pexpm1(const Packet& a) { return numext::expm1(a); } /** \internal \returns the log of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog(const Packet& a) { using std::log; return log(a); } +Packet plog(const Packet& a) { EIGEN_USING_STD(log); return log(a); } /** \internal \returns the log1p of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -411,16 +801,24 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); } /** \internal \returns the log10 of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog10(const Packet& a) { using std::log10; return log10(a); } +Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); } + +/** \internal \returns the log10 of \a a (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plog2(const Packet& a) { + typedef typename internal::unpacket_traits::type Scalar; + return 
pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); +} /** \internal \returns the square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); } +Packet psqrt(const Packet& a) { return numext::sqrt(a); } /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet prsqrt(const Packet& a) { - return pdiv(pset1(1), psqrt(a)); + typedef typename internal::unpacket_traits::type Scalar; + return pdiv(pset1(Scalar(1)), psqrt(a)); } /** \internal \returns the rounded value of \a a (coeff-wise) */ @@ -431,15 +829,121 @@ Packet pround(const Packet& a) { using numext::round; return round(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } +/** \internal \returns the rounded value of \a a (coeff-wise) with current + * rounding mode */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet print(const Packet& a) { using numext::rint; return rint(a); } + /** \internal \returns the ceil of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } +/** \internal \returns the first element of a packet */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +pfirst(const Packet& a) +{ return a; } + +/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. + * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} + * For packet-size smaller or equal to 4, this boils down to a noop. 
+ */ +template +EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type +predux_half_dowto4(const Packet& a) +{ return a; } + +// Slow generic implementation of Packet reduction. +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux_helper(const Packet& a, Op op) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n]; + pstoreu(elements, a); + for(size_t k = n / 2; k > 0; k /= 2) { + for(size_t i = 0; i < k; ++i) { + elements[i] = op(elements[i], elements[i + k]); + } + } + return elements[0]; +} + +/** \internal \returns the sum of the elements of \a a*/ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type +predux(const Packet& a) +{ + return a; +} + +/** \internal \returns the product of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul))); +} + +/** \internal \returns the min of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( + const Packet &a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); +} + +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); +} + +/** \internal \returns the min of the elements of \a a */ +template +EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( + const Packet &a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); +} + +template +EIGEN_DEVICE_FUNC inline 
typename unpacket_traits::type predux_max( + const Packet& a) { + typedef typename unpacket_traits::type Scalar; + return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); +} + +#undef EIGEN_BINARY_OP_NAN_PROPAGATION + +/** \internal \returns true if all coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +// not needed yet +// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) +// { return bool(a); } + +/** \internal \returns true if any coeffs of \a a means "true" + * It is supposed to be called on values returned by pcmp_*. + */ +template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) +{ + // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. + // It is expected that "true" is either: + // - Scalar(1) + // - bits full of ones (NaN for floats), + // - or first bit equals to 1 (1 for ints, smallest denormal for floats). + // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. + typedef typename unpacket_traits::type Scalar; + return numext::not_equal_strict(predux(a), Scalar(0)); +} + /*************************************************************************** * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ -/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ +/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. 
\a to must be 16 bytes aligned */ // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) template inline void pstore1(typename unpacket_traits::type* to, const typename unpacket_traits::type& a) @@ -482,52 +986,23 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro * by the current computation. */ template -inline Packet ploadt_ro(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) { return ploadt(from); } -/** \internal default implementation of palign() allowing partial specialization */ -template -struct palign_impl -{ - // by default data are aligned, so there is nothing to be done :) - static inline void run(PacketType&, const PacketType&) {} -}; - -/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements - * of \a first and \a Offset first elements of \a second. - * - * This function is currently only used to optimize matrix-vector products on unligned matrices. - * It takes 2 packets that represent a contiguous memory array, and returns a packet starting - * at the position \a Offset. For instance, for packets of 4 elements, we have: - * Input: - * - first = {f0,f1,f2,f3} - * - second = {s0,s1,s2,s3} - * Output: - * - if Offset==0 then {f0,f1,f2,f3} - * - if Offset==1 then {f1,f2,f3,s0} - * - if Offset==2 then {f2,f3,s0,s1} - * - if Offset==3 then {f3,s0,s1,s3} - */ -template -inline void palign(PacketType& first, const PacketType& second) -{ - palign_impl::run(first,second); -} - /*************************************************************************** * Fast complex products (GCC generates a function call which is very slow) ***************************************************************************/ // Eigen+CUDA does not support complexes. 
-#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } +{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } +{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } #endif @@ -558,34 +1033,6 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } -/** \internal \returns \a a with the first coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertfirst(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - mask.select[0] = true; - // This for loop should be optimized away by the compiler. - for(Index i=1; i::size; ++i) - mask.select[i] = false; - return pblend(mask, pset1(b), a); -} - -/** \internal \returns \a a with the last coefficient replaced by the scalar b */ -template EIGEN_DEVICE_FUNC inline Packet -pinsertlast(const Packet& a, typename unpacket_traits::type b) -{ - // Default implementation based on pblend. - // It must be specialized for higher performance. - Selector::size> mask; - // This for loop should be optimized away by the compiler. 
- for(Index i=0; i::size-1; ++i) - mask.select[i] = false; - mask.select[unpacket_traits::size-1] = true; - return pblend(mask, pset1(b), a); -} - } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/GlobalFunctions.h b/externals/eigen/Eigen/src/Core/GlobalFunctions.h index 769dc255..629af94b 100644 --- a/externals/eigen/Eigen/src/Core/GlobalFunctions.h +++ b/externals/eigen/Eigen/src/Core/GlobalFunctions.h @@ -66,21 +66,31 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) +#if EIGEN_HAS_CXX11_MATH + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) +#endif + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural 
logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2,scalar_log2_op,base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op,absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op,squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op,complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op,square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil) @@ -88,7 
+98,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) - + /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. * * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). @@ -102,17 +112,18 @@ namespace Eigen inline const CwiseBinaryOp,Derived,Constant > pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) { - return x.derived().pow(exponent); - } - - template - inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) - pow(const Eigen::ArrayBase& x, const typename Derived::Scalar& exponent) { - return x.derived().pow(exponent); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) + pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) + { + typedef typename internal::promote_scalar_arg::type PromotedExponent; + return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), + typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); } #endif @@ -122,21 +133,21 @@ namespace Eigen * * Example: \include 
Cwise_array_power_array.cpp * Output: \verbinclude Cwise_array_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase */ template inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> - pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) + pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( x.derived(), exponents.derived() ); } - + /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents. * * This function computes the coefficient-wise power between a scalar and an array of exponents. @@ -145,7 +156,7 @@ namespace Eigen * * Example: \include Cwise_scalar_power_array.cpp * Output: \verbinclude Cwise_scalar_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase @@ -155,21 +166,17 @@ namespace Eigen inline const CwiseBinaryOp,Constant,Derived> pow(const Scalar& x,const Eigen::ArrayBase& x); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), - const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type - pow(const Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); - } - - template - inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) - pow(const typename Derived::Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename 
internal::promote_scalar_arg::type,Derived,pow)) + pow(const Scalar& x, const Eigen::ArrayBase& exponents) { + typedef typename internal::promote_scalar_arg::type PromotedScalar; + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op(x)), exponents.derived()); } #endif diff --git a/externals/eigen/Eigen/src/Core/IO.h b/externals/eigen/Eigen/src/Core/IO.h index da7fd6cc..e81c3152 100644 --- a/externals/eigen/Eigen/src/Core/IO.h +++ b/externals/eigen/Eigen/src/Core/IO.h @@ -41,6 +41,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& * - \b rowSuffix string printed at the end of each row * - \b matPrefix string printed at the beginning of the matrix * - \b matSuffix string printed at the end of the matrix + * - \b fill character printed to fill the empty space in aligned columns * * Example: \include IOFormat.cpp * Output: \verbinclude IOFormat.out @@ -53,9 +54,9 @@ struct IOFormat IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ", const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="", - const std::string& _matPrefix="", const std::string& _matSuffix="") + const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ') : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator), - rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags) + rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags) { // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline // don't add rowSpacer if columns are not to be aligned @@ -71,6 +72,7 @@ struct IOFormat std::string matPrefix, matSuffix; std::string rowPrefix, rowSuffix, 
rowSeparator, rowSpacer; std::string coeffSeparator; + char fill; int precision; int flags; }; @@ -128,6 +130,9 @@ struct significant_decimals_impl template std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt) { + using internal::is_same; + using internal::conditional; + if(_m.size() == 0) { s << fmt.matPrefix << fmt.matSuffix; @@ -136,6 +141,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& typename Derived::Nested m = _m; typedef typename Derived::Scalar Scalar; + typedef typename + conditional< + is_same::value || + is_same::value || + is_same::value || + is_same::value, + int, + typename conditional< + is_same >::value || + is_same >::value || + is_same >::value || + is_same >::value, + std::complex, + const Scalar& + >::type + >::type PrintType; Index width = 0; @@ -172,23 +193,31 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& { std::stringstream sstr; sstr.copyfmt(s); - sstr << m.coeff(i,j); + sstr << static_cast(m.coeff(i,j)); width = std::max(width, Index(sstr.str().length())); } } + std::streamsize old_width = s.width(); + char old_fill_character = s.fill(); s << fmt.matPrefix; for(Index i = 0; i < m.rows(); ++i) { if (i) s << fmt.rowSpacer; s << fmt.rowPrefix; - if(width) s.width(width); - s << m.coeff(i, 0); + if(width) { + s.fill(fmt.fill); + s.width(width); + } + s << static_cast(m.coeff(i, 0)); for(Index j = 1; j < m.cols(); ++j) { s << fmt.coeffSeparator; - if (width) s.width(width); - s << m.coeff(i, j); + if(width) { + s.fill(fmt.fill); + s.width(width); + } + s << static_cast(m.coeff(i, j)); } s << fmt.rowSuffix; if( i < m.rows() - 1) @@ -196,6 +225,10 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& } s << fmt.matSuffix; if(explicit_precision) s.precision(old_precision); + if(width) { + s.fill(old_fill_character); + s.width(old_width); + } return s; } diff --git 
a/externals/eigen/Eigen/src/Core/IndexedView.h b/externals/eigen/Eigen/src/Core/IndexedView.h new file mode 100644 index 00000000..08476251 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/IndexedView.h @@ -0,0 +1,237 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_INDEXED_VIEW_H +#define EIGEN_INDEXED_VIEW_H + +namespace Eigen { + +namespace internal { + +template +struct traits > + : traits +{ + enum { + RowsAtCompileTime = int(array_size::value), + ColsAtCompileTime = int(array_size::value), + MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic, + MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic, + + XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0, + IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : XprTypeIsRowMajor, + + RowIncr = int(get_compile_time_incr::value), + ColIncr = int(get_compile_time_incr::value), + InnerIncr = IsRowMajor ? ColIncr : RowIncr, + OuterIncr = IsRowMajor ? RowIncr : ColIncr, + + HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor), + XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time::ret) : int(outer_stride_at_compile_time::ret), + XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), + + InnerSize = XprTypeIsRowMajor ? 
ColsAtCompileTime : RowsAtCompileTime, + IsBlockAlike = InnerIncr==1 && OuterIncr==1, + IsInnerPannel = HasSameStorageOrderAsXprType && is_same,typename conditional::type>::value, + + InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr, + OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr, + + ReturnAsScalar = is_same::value && is_same::value, + ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike, + ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock), + + // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag, + // but this is too strict regarding negative strides... + DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, + FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? 
LinearAccessBit : 0, + Flags = (traits::Flags & (HereditaryBits | DirectAccessMask )) | FlagsLvalueBit | FlagsRowMajorBit | FlagsLinearAccessBit + }; + + typedef Block BlockType; +}; + +} + +template +class IndexedViewImpl; + + +/** \class IndexedView + * \ingroup Core_Module + * + * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices + * + * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns + * \tparam RowIndices the type of the object defining the sequence of row indices + * \tparam ColIndices the type of the object defining the sequence of column indices + * + * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection + * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$ + * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$ be the nested matrix, then the resulting matrix \f$ B \f$ has \c m + * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$. + * + * The \c RowIndices and \c ColIndices types must be compatible with the following API: + * \code + * operator[](Index) const; + * Index size() const; + * \endcode + * + * Typical supported types thus include: + * - std::vector + * - std::valarray + * - std::array + * - Plain C arrays: int[N] + * - Eigen::ArrayXi + * - decltype(ArrayXi::LinSpaced(...)) + * - Any view/expressions of the previous types + * - Eigen::ArithmeticSequence + * - Eigen::internal::AllRange (helper for Eigen::all) + * - Eigen::internal::SingleRange (helper for single index) + * - etc. + * + * In typical usages of %Eigen, this class should never be used directly. It is the return type of + * DenseBase::operator()(const RowIndices&, const ColIndices&). 
+ * + * \sa class Block + */ +template +class IndexedView : public IndexedViewImpl::StorageKind> +{ +public: + typedef typename IndexedViewImpl::StorageKind>::Base Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView) + + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; + + template + IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices) + : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices) + {} + + /** \returns number of rows */ + Index rows() const { return internal::size(m_rowIndices); } + + /** \returns number of columns */ + Index cols() const { return internal::size(m_colIndices); } + + /** \returns the nested expression */ + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + typename internal::remove_reference::type& + nestedExpression() { return m_xpr; } + + /** \returns a const reference to the object storing/generating the row indices */ + const RowIndices& rowIndices() const { return m_rowIndices; } + + /** \returns a const reference to the object storing/generating the column indices */ + const ColIndices& colIndices() const { return m_colIndices; } + +protected: + MatrixTypeNested m_xpr; + RowIndices m_rowIndices; + ColIndices m_colIndices; +}; + + +// Generic API dispatcher +template +class IndexedViewImpl + : public internal::generic_xpr_base >::type +{ +public: + typedef typename internal::generic_xpr_base >::type Base; +}; + +namespace internal { + + +template +struct unary_evaluator, IndexBased> + : evaluator_base > +{ + typedef IndexedView XprType; + + enum { + CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of row/col index */, + + FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1) ? 
LinearAccessBit : 0, + + FlagsRowMajorBit = traits::FlagsRowMajorBit, + + Flags = (evaluator::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) | FlagsLinearAccessBit | FlagsRowMajorBit, + + Alignment = 0 + }; + + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const + { + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const + { + Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; + Index col = XprType::RowsAtCompileTime == 1 ? 
index : 0; + return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); + } + +protected: + + evaluator m_argImpl; + const XprType& m_xpr; + +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_INDEXED_VIEW_H diff --git a/externals/eigen/Eigen/src/Core/Inverse.h b/externals/eigen/Eigen/src/Core/Inverse.h index b76f0439..c514438c 100644 --- a/externals/eigen/Eigen/src/Core/Inverse.h +++ b/externals/eigen/Eigen/src/Core/Inverse.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2014 Gael Guennebaud +// Copyright (C) 2014-2019 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -10,7 +10,7 @@ #ifndef EIGEN_INVERSE_H #define EIGEN_INVERSE_H -namespace Eigen { +namespace Eigen { template class InverseImpl; @@ -44,19 +44,18 @@ class Inverse : public InverseImpl::S { public: typedef typename XprType::StorageIndex StorageIndex; - typedef typename XprType::PlainObject PlainObject; typedef typename XprType::Scalar Scalar; typedef typename internal::ref_selector::type XprTypeNested; typedef typename internal::remove_all::type XprTypeNestedCleaned; typedef typename internal::ref_selector::type Nested; typedef typename internal::remove_all::type NestedExpression; - + explicit EIGEN_DEVICE_FUNC Inverse(const XprType &xpr) : m_xpr(xpr) {} - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.rows(); } EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; } @@ -82,7 +81,7 @@ namespace internal { /** \internal * \brief Default evaluator for Inverse expression. 
- * + * * This default evaluator for Inverse expression simply evaluate the inverse into a temporary * by a call to internal::call_assignment_no_alias. * Therefore, inverse implementers only have to specialize Assignment, ...> for @@ -97,7 +96,7 @@ struct unary_evaluator > typedef Inverse InverseType; typedef typename InverseType::PlainObject PlainObject; typedef evaluator Base; - + enum { Flags = Base::Flags | EvalBeforeNestingBit }; unary_evaluator(const InverseType& inv_xpr) @@ -106,11 +105,11 @@ struct unary_evaluator > ::new (static_cast(this)) Base(m_result); internal::call_assignment_no_alias(m_result, inv_xpr); } - + protected: PlainObject m_result; }; - + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/Map.h b/externals/eigen/Eigen/src/Core/Map.h index 06d19670..218cc157 100644 --- a/externals/eigen/Eigen/src/Core/Map.h +++ b/externals/eigen/Eigen/src/Core/Map.h @@ -11,7 +11,7 @@ #ifndef EIGEN_MAP_H #define EIGEN_MAP_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -20,11 +20,17 @@ struct traits > { typedef traits TraitsBase; enum { + PlainObjectTypeInnerSize = ((traits::Flags&RowMajorBit)==RowMajorBit) + ? PlainObjectType::ColsAtCompileTime + : PlainObjectType::RowsAtCompileTime, + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) : int(StrideType::InnerStrideAtCompileTime), OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) + ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic + ? Dynamic + : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize)) : int(StrideType::OuterStrideAtCompileTime), Alignment = int(MapOptions)&int(AlignedMask), Flags0 = TraitsBase::Flags & (~NestByRefBit), @@ -41,7 +47,7 @@ struct traits > * \brief A matrix or vector expression mapping an existing array of data. 
* * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. * The default is \c #Unaligned. * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout * of an ordinary, contiguous array. This can be overridden by specifying strides. @@ -98,19 +104,20 @@ template class Ma EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : IsVectorAtCompileTime ? this->size() - : int(Flags)&RowMajorBit ? this->cols() - : this->rows(); + : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + : IsVectorAtCompileTime ? (this->size() * innerStride()) + : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) + : (this->rows() * innerStride()); } /** Constructor in the fixed-size case. 
diff --git a/externals/eigen/Eigen/src/Core/MapBase.h b/externals/eigen/Eigen/src/Core/MapBase.h index 020f939a..d856447f 100644 --- a/externals/eigen/Eigen/src/Core/MapBase.h +++ b/externals/eigen/Eigen/src/Core/MapBase.h @@ -15,7 +15,7 @@ EIGEN_STATIC_ASSERT((int(internal::evaluator::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \ YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT) -namespace Eigen { +namespace Eigen { /** \ingroup Core_Module * @@ -43,6 +43,7 @@ template class MapBase enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, + InnerStrideAtCompileTime = internal::traits::InnerStrideAtCompileTime, SizeAtCompileTime = Base::SizeAtCompileTime }; @@ -86,9 +87,11 @@ template class MapBase typedef typename Base::CoeffReturnType CoeffReturnType; /** \copydoc DenseBase::rows() */ - EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_rows.value(); } /** \copydoc DenseBase::cols() */ - EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_cols.value(); } /** Returns a pointer to the first coefficient of the matrix or vector. * @@ -181,14 +184,19 @@ template class MapBase #endif protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase) template EIGEN_DEVICE_FUNC void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 + // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value: + const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 
1 : Index(InnerStrideAtCompileTime); + EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride); eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits::Alignment) == 0) - || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); + || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); #endif } @@ -290,6 +298,9 @@ template class MapBase // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, // see bugs 821 and 920. using ReadOnlyMapBase::Base::operator=; + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase) }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/externals/eigen/Eigen/src/Core/MathFunctions.h b/externals/eigen/Eigen/src/Core/MathFunctions.h index 8d47fb8a..61b78f4f 100644 --- a/externals/eigen/Eigen/src/Core/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/MathFunctions.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2006-2010 Benoit Jacob +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -10,10 +11,11 @@ #ifndef EIGEN_MATHFUNCTIONS_H #define EIGEN_MATHFUNCTIONS_H -// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html // TODO this should better be moved to NumTraits -#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L - +// Source: WolframAlpha +#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L +#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L +#define EIGEN_LN2 0.693147180559945309417232121458176568075500134360255254120680009493393621L namespace Eigen { @@ -97,7 +99,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct real_impl > { @@ -145,7 +147,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct imag_impl > { @@ -213,12 +215,12 @@ struct imag_ref_default_impl template struct imag_ref_default_impl { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Scalar run(Scalar&) { return Scalar(0); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline const Scalar run(const Scalar&) { return Scalar(0); @@ -239,7 +241,7 @@ struct imag_ref_retval ****************************************************************************/ template::IsComplex> -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -249,7 +251,7 @@ struct conj_impl }; template -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -259,6 +261,9 @@ struct conj_impl } }; +template::IsComplex> +struct conj_impl : conj_default_impl {}; + template struct conj_retval { @@ -287,7 +292,7 @@ struct abs2_impl_default // IsComplex EIGEN_DEVICE_FUNC static inline 
RealScalar run(const Scalar& x) { - return real(x)*real(x) + imag(x)*imag(x); + return x.real()*x.real() + x.imag()*x.imag(); } }; @@ -308,19 +313,81 @@ struct abs2_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of sqrt/rsqrt * +****************************************************************************/ + +template +struct sqrt_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x) + { + EIGEN_USING_STD(sqrt); + return sqrt(x); + } +}; + +// Complex sqrt defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& a_x); + +// Custom implementation is faster than `std::sqrt`, works on +// GPU, and correctly handles special cases (unlike MSVC). +template +struct sqrt_impl > +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE std::complex run(const std::complex& x) + { + return complex_sqrt(x); + } +}; + +template +struct sqrt_retval +{ + typedef Scalar type; +}; + +// Default implementation relies on numext::sqrt, at bottom of file. +template +struct rsqrt_impl; + +// Complex rsqrt defined in MathFunctionsImpl.h. 
+template EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& a_x); + +template +struct rsqrt_impl > +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE std::complex run(const std::complex& x) + { + return complex_rsqrt(x); + } +}; + +template +struct rsqrt_retval +{ + typedef Scalar type; +}; + /**************************************************************************** * Implementation of norm1 * ****************************************************************************/ template -struct norm1_default_impl +struct norm1_default_impl; + +template +struct norm1_default_impl { typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); - return abs(real(x)) + abs(imag(x)); + EIGEN_USING_STD(abs); + return abs(x.real()) + abs(x.imag()); } }; @@ -330,7 +397,7 @@ struct norm1_default_impl EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(abs); return abs(x); } }; @@ -348,31 +415,7 @@ struct norm1_retval * Implementation of hypot * ****************************************************************************/ -template -struct hypot_impl -{ - typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) - { - EIGEN_USING_STD_MATH(abs); - EIGEN_USING_STD_MATH(sqrt); - RealScalar _x = abs(x); - RealScalar _y = abs(y); - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - if(p==RealScalar(0)) return RealScalar(0); - return p * sqrt(RealScalar(1) + qp*qp); - } -}; +template struct hypot_impl; template struct hypot_retval @@ -384,7 +427,7 @@ struct hypot_retval * Implementation of cast * ****************************************************************************/ -template +template struct cast_impl { EIGEN_DEVICE_FUNC @@ -394,6 +437,22 @@ struct cast_impl } }; +// Casting from S -> Complex leads to an implicit conversion from 
S to T, +// generating warnings on clang. Here we explicitly cast the real component. +template +struct cast_impl::IsComplex && NumTraits::IsComplex + >::type> +{ + EIGEN_DEVICE_FUNC + static inline NewType run(const OldType& x) + { + typedef typename NumTraits::Real NewReal; + return static_cast(static_cast(x)); + } +}; + // here, for once, we're plainly returning NewType: we don't want cast to do weird things. template @@ -407,29 +466,59 @@ inline NewType cast(const OldType& x) * Implementation of round * ****************************************************************************/ +template +struct round_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) #if EIGEN_HAS_CXX11_MATH - template - struct round_impl { - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - using std::round; - return round(x); - } - }; + EIGEN_USING_STD(round); +#endif + return Scalar(round(x)); + } +}; + +#if !EIGEN_HAS_CXX11_MATH +#if EIGEN_HAS_C99_MATH +// Use ::roundf for float. +template<> +struct round_impl { + EIGEN_DEVICE_FUNC + static inline float run(const float& x) + { + return ::roundf(x); + } +}; #else - template - struct round_impl +template +struct round_using_floor_ceil_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) { - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - EIGEN_USING_STD_MATH(floor); - EIGEN_USING_STD_MATH(ceil); - return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5)); + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + // Without C99 round/roundf, resort to floor/ceil. + EIGEN_USING_STD(floor); + EIGEN_USING_STD(ceil); + // If not enough precision to resolve a decimal at all, return the input. + // Otherwise, adding 0.5 can trigger an increment by 1. 
+ const Scalar limit = Scalar(1ull << (NumTraits::digits() - 1)); + if (x >= limit || x <= -limit) { + return x; } - }; -#endif + return (x > Scalar(0)) ? Scalar(floor(x + Scalar(0.5))) : Scalar(ceil(x - Scalar(0.5))); + } +}; + +template<> +struct round_impl : round_using_floor_ceil_impl {}; + +template<> +struct round_impl : round_using_floor_ceil_impl {}; +#endif // EIGEN_HAS_C99_MATH +#endif // !EIGEN_HAS_CXX11_MATH template struct round_retval @@ -438,43 +527,112 @@ struct round_retval }; /**************************************************************************** -* Implementation of arg * +* Implementation of rint * ****************************************************************************/ +template +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) #if EIGEN_HAS_CXX11_MATH - template - struct arg_impl { - static inline Scalar run(const Scalar& x) - { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } - }; -#else - template::IsComplex> - struct arg_default_impl + EIGEN_USING_STD(rint); +#endif + return rint(x); + } +}; + +#if !EIGEN_HAS_CXX11_MATH +template<> +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline double run(const double& x) { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) - { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); } - }; + return ::rint(x); + } +}; +template<> +struct rint_impl { + EIGEN_DEVICE_FUNC + static inline float run(const float& x) + { + return ::rintf(x); + } +}; +#endif - template - struct arg_default_impl +template +struct rint_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** +* Implementation of arg * +****************************************************************************/ + +// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. 
+// This seems to be fixed in VS 2019. +#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) +// std::arg is only defined for types of std::complex, or integer types or float/double/long double +template::IsComplex || is_integral::value + || is_same::value || is_same::value + || is_same::value > +struct arg_default_impl; + +template +struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) - { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } - }; + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else + EIGEN_USING_STD(arg); + #endif + return static_cast(arg(x)); + } +}; - template struct arg_impl : arg_default_impl {}; +// Must be non-complex floating-point type (e.g. half/bfloat16). +template +struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); + } +}; +#else +template::IsComplex> +struct arg_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return (x < RealScalar(0)) ? 
RealScalar(EIGEN_PI) : RealScalar(0); + } +}; + +template +struct arg_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + EIGEN_USING_STD(arg); + return arg(x); + } +}; #endif +template struct arg_impl : arg_default_impl {}; template struct arg_retval @@ -482,6 +640,80 @@ struct arg_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of expm1 * +****************************************************************************/ + +// This implementation is based on GSL Math's expm1. +namespace std_fallback { + // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar, + // or that there is no suitable std::expm1 function available. Implementation + // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php. + template + EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + + EIGEN_USING_STD(exp); + Scalar u = exp(x); + if (numext::equal_strict(u, Scalar(1))) { + return x; + } + Scalar um1 = u - RealScalar(1); + if (numext::equal_strict(um1, Scalar(-1))) { + return RealScalar(-1); + } + + EIGEN_USING_STD(log); + Scalar logu = log(u); + return numext::equal_strict(u, logu) ? 
u : (u - RealScalar(1)) * x / logu; + } +} + +template +struct expm1_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + #if EIGEN_HAS_CXX11_MATH + using std::expm1; + #else + using std_fallback::expm1; + #endif + return expm1(x); + } +}; + +template +struct expm1_retval +{ + typedef Scalar type; +}; + +/**************************************************************************** +* Implementation of log * +****************************************************************************/ + +// Complex log defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z); + +template +struct log_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD(log); + return static_cast(log(x)); + } +}; + +template +struct log_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& z) + { + return complex_log(z); + } +}; + /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -493,25 +725,38 @@ namespace std_fallback { EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) typedef typename NumTraits::Real RealScalar; - EIGEN_USING_STD_MATH(log); + EIGEN_USING_STD(log); Scalar x1p = RealScalar(1) + x; - return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + Scalar log_1p = log_impl::run(x1p); + const bool is_small = numext::equal_strict(x1p, Scalar(1)); + const bool is_inf = numext::equal_strict(x1p, log_1p); + return (is_small || is_inf) ? 
x : x * (log_1p / (x1p - RealScalar(1))); } } template struct log1p_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH using std::log1p; - #endif + #else using std_fallback::log1p; + #endif return log1p(x); } }; +// Specialization for complex types that are not supported by std::log1p. +template +struct log1p_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run( + const std::complex& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + return std_fallback::log1p(x); + } +}; template struct log1p_retval @@ -530,7 +775,7 @@ struct pow_impl typedef typename ScalarBinaryOpTraits >::ReturnType result_type; static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y) { - EIGEN_USING_STD_MATH(pow); + EIGEN_USING_STD(pow); return pow(x, y); } }; @@ -640,21 +885,28 @@ template struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) - { - typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; - if(y=x the result converted to an unsigned long is still correct. - std::size_t range = ScalarX(y)-ScalarX(x); - std::size_t offset = 0; - // rejection sampling - std::size_t divisor = 1; - std::size_t multiplier = 1; - if(range::type ScalarU; + // ScalarX is the widest of ScalarU and unsigned int. + // We'll deal only with ScalarX and unsigned int below thus avoiding signed + // types and arithmetic and signed overflows (which are undefined behavior). + typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX; + // The following difference doesn't overflow, provided our integer types are two's + // complement and have the same number of padding bits in signed and unsigned variants. + // This is the case in most modern implementations of C++. 
+ ScalarX range = ScalarX(y) - ScalarX(x); + ScalarX offset = 0; + ScalarX divisor = 1; + ScalarX multiplier = 1; + const unsigned rand_max = RAND_MAX; + if (range <= rand_max) divisor = (rand_max + 1) / (range + 1); + else multiplier = 1 + range / (rand_max + 1); + // Rejection sampling. do { - offset = (std::size_t(std::rand()) * multiplier) / divisor; + offset = (unsigned(std::rand()) * multiplier) / divisor; } while (offset > range); return Scalar(ScalarX(x) + offset); } @@ -679,8 +931,8 @@ struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) { - return Scalar(random(real(x), real(y)), - random(imag(x), imag(y))); + return Scalar(random(x.real(), y.real()), + random(x.imag(), y.imag())); } static inline Scalar run() { @@ -701,7 +953,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementatin of is* functions +// Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. 
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) @@ -730,7 +982,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -745,7 +997,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -760,7 +1012,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -817,7 +1069,6 @@ template EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x) template EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); template T generic_fast_tanh_float(const T& a_x); - } // end namespace internal /**************************************************************************** @@ -826,12 +1077,12 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#ifndef __CUDA_ARCH__ +#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) { - EIGEN_USING_STD_MATH(min); + EIGEN_USING_STD(min) return min EIGEN_NOT_A_MACRO (x,y); } @@ -839,7 +1090,7 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) { - EIGEN_USING_STD_MATH(max); + EIGEN_USING_STD(max) return max EIGEN_NOT_A_MACRO (x,y); } #else @@ -855,6 +1106,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return fminf(x, y); } 
+template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) +{ + return fmin(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fminl" on HIP yet + return (x < y) ? x : y; +#else + return fminl(x, y); +#endif +} + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) @@ -867,6 +1136,92 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) { return fmaxf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) +{ + return fmax(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fmaxl" on HIP yet + return (x > y) ? x : y; +#else + return fmaxl(x, y); +#endif +} +#endif + +#if defined(SYCL_DEVICE_ONLY) + + +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + 
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) +#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) +#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) + +#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ +template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \ + return cl::sycl::FUNC(x); \ + } + +#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE) + +#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \ + template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \ + return cl::sycl::FUNC(x, y); \ + } + +#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ + SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE) + +#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \ + 
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE) + +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin) +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax) + #endif @@ -933,6 +1288,37 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); } +EIGEN_DEVICE_FUNC +inline bool abs2(bool x) { return x; } + +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) +{ + return x > y ? x - y : y - x; +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y) +{ + return fabsf(x - y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) +{ + return fabs(x - y); +} + +#if !defined(EIGEN_GPUCC) +// HIP and CUDA do not support long double. +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) { + return fabsl(x - y); +} +#endif + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) @@ -947,6 +1333,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); } +#if defined(SYCL_DEVICE_ONLY) + SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot) +#endif + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) @@ -954,7 +1344,11 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -969,10 +1363,27 @@ inline typename internal::pow_impl::result_type pow(const Scala return 
internal::pow_impl::run(x, y); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow) +#endif + template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool) +#endif + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x); +} + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) @@ -980,15 +1391,23 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x); } +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round) +#endif + template EIGEN_DEVICE_FUNC T (floor)(const T& x) { - EIGEN_USING_STD_MATH(floor); + EIGEN_USING_STD(floor) return floor(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1000,11 +1419,15 @@ template EIGEN_DEVICE_FUNC T (ceil)(const T& x) { - EIGEN_USING_STD_MATH(ceil); + EIGEN_USING_STD(ceil); return ceil(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1030,28 +1453,49 @@ inline int log2(int x) /** \returns the square root of \a x. 
* - * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * It is essentially equivalent to + * \code using std::sqrt; return sqrt(x); \endcode * but slightly faster for float/double and some compilers (e.g., gcc), thanks to * specializations when SSE is enabled. * * It's usage is justified in performance critical functions, like norm/normalize. */ +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x); +} + +// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool). +template<> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +bool sqrt(const bool &x) { return x; } + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt) +#endif + +/** \returns the reciprocal square root of \a x. **/ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T sqrt(const T &x) +T rsqrt(const T& x) { - EIGEN_USING_STD_MATH(sqrt); - return sqrt(x); + return internal::rsqrt_impl::run(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x) { - EIGEN_USING_STD_MATH(log); - return log(x); + return internal::log_impl::run(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log) +#endif + + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1061,12 +1505,25 @@ double log(const double &x) { return ::log(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits::Real abs(const T &x) { - EIGEN_USING_STD_MATH(abs); +typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type +abs(const T &x) { + EIGEN_USING_STD(abs); return abs(x); } -#ifdef __CUDACC__ +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename internal::enable_if::IsSigned || NumTraits::IsComplex),typename 
NumTraits::Real>::type +abs(const T &x) { + return x; +} + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1087,26 +1544,69 @@ double abs(const std::complex& x) { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T &x) { - EIGEN_USING_STD_MATH(exp); + EIGEN_USING_STD(exp); return exp(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + float com = ::expf(x.real()); + float res_real = com * ::cosf(x.imag()); + float res_imag = com * ::sinf(x.imag()); + return std::complex(res_real, res_imag); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + double com = ::exp(x.real()); + double res_real = com * ::cos(x.imag()); + double res_imag = com * ::sin(x.imag()); + return std::complex(res_real, res_imag); +} +#endif + +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x); +} + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1) +#endif + +#if defined(EIGEN_GPUCC) +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float expm1(const float &x) { return ::expm1f(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double expm1(const double &x) { return ::expm1(x); } #endif template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T &x) { - EIGEN_USING_STD_MATH(cos); + EIGEN_USING_STD(cos); return 
cos(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1117,11 +1617,15 @@ double cos(const double &x) { return ::cos(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T &x) { - EIGEN_USING_STD_MATH(sin); + EIGEN_USING_STD(sin); return sin(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1132,11 +1636,15 @@ double sin(const double &x) { return ::sin(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T &x) { - EIGEN_USING_STD_MATH(tan); + EIGEN_USING_STD(tan); return tan(x); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1147,11 +1655,25 @@ double tan(const double &x) { return ::tan(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T &x) { - EIGEN_USING_STD_MATH(acos); + EIGEN_USING_STD(acos); return acos(x); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T acosh(const T &x) { + EIGEN_USING_STD(acosh); + return static_cast(acosh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1162,11 +1684,25 @@ double acos(const double &x) { return ::acos(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T &x) { - EIGEN_USING_STD_MATH(asin); + EIGEN_USING_STD(asin); return 
asin(x); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T asinh(const T &x) { + EIGEN_USING_STD(asinh); + return static_cast(asinh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1177,11 +1713,25 @@ double asin(const double &x) { return ::asin(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T &x) { - EIGEN_USING_STD_MATH(atan); - return atan(x); + EIGEN_USING_STD(atan); + return static_cast(atan(x)); } -#ifdef __CUDACC__ +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atanh(const T &x) { + EIGEN_USING_STD(atanh); + return static_cast(atanh(x)); +} +#endif + +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1193,11 +1743,15 @@ double atan(const double &x) { return ::atan(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T &x) { - EIGEN_USING_STD_MATH(cosh); - return cosh(x); + EIGEN_USING_STD(cosh); + return static_cast(cosh(x)); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1208,11 +1762,15 @@ double cosh(const double &x) { return ::cosh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T &x) { - EIGEN_USING_STD_MATH(sinh); - return sinh(x); + EIGEN_USING_STD(sinh); + return static_cast(sinh(x)); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, 
sinh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1223,16 +1781,20 @@ double sinh(const double &x) { return ::sinh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T &x) { - EIGEN_USING_STD_MATH(tanh); + EIGEN_USING_STD(tanh); return tanh(x); } -#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH +#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh) +#endif + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1243,11 +1805,15 @@ double tanh(const double &x) { return ::tanh(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) { - EIGEN_USING_STD_MATH(fmod); + EIGEN_USING_STD(fmod); return fmod(a, b); } -#ifdef __CUDACC__ +#if defined(SYCL_DEVICE_ONLY) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod) +#endif + +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { @@ -1261,6 +1827,23 @@ double fmod(const double& a, const double& b) { } #endif +#if defined(SYCL_DEVICE_ONLY) +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE +#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC +#undef SYCL_SPECIALIZE_UNARY_FUNC +#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC +#undef 
SYCL_SPECIALIZE_GEN2_BINARY_FUNC +#undef SYCL_SPECIALIZE_BINARY_FUNC +#endif + } // end namespace numext namespace internal { @@ -1384,18 +1967,23 @@ template<> struct random_impl { return random(0,1)==0 ? false : true; } + + static inline bool run(const bool& a, const bool& b) + { + return random(a, b)==0 ? false : true; + } }; template<> struct scalar_fuzzy_impl { typedef bool RealScalar; - + template EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&) { return !x; } - + EIGEN_DEVICE_FUNC static inline bool isApprox(bool x, bool y, bool) { @@ -1407,10 +1995,61 @@ template<> struct scalar_fuzzy_impl { return (!x) || y; } - + +}; + +} // end namespace internal + +// Default implementations that rely on other numext implementations +namespace internal { + +// Specialization for complex types that are not supported by std::expm1. +template +struct expm1_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run( + const std::complex& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) + RealScalar xr = x.real(); + RealScalar xi = x.imag(); + // expm1(z) = exp(z) - 1 + // = exp(x + i * y) - 1 + // = exp(x) * (cos(y) + i * sin(y)) - 1 + // = exp(x) * cos(y) - 1 + i * exp(x) * sin(y) + // Imag(expm1(z)) = exp(x) * sin(y) + // Real(expm1(z)) = exp(x) * cos(y) - 1 + // = exp(x) * cos(y) - 1. + // = expm1(x) + exp(x) * (cos(y) - 1) + // = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2) + RealScalar erm1 = numext::expm1(xr); + RealScalar er = erm1 + RealScalar(1.); + RealScalar sin2 = numext::sin(xi / RealScalar(2.)); + sin2 = sin2 * sin2; + RealScalar s = numext::sin(xi); + RealScalar real_part = erm1 - RealScalar(2.) 
* er * sin2; + return std::complex(real_part, er * s); + } +}; + +template +struct rsqrt_impl { + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE T run(const T& x) { + return T(1)/numext::sqrt(x); + } +}; + +#if defined(EIGEN_GPU_COMPILE_PHASE) +template +struct conj_impl, true> +{ + EIGEN_DEVICE_FUNC + static inline std::complex run(const std::complex& x) + { + return std::complex(numext::real(x), -numext::imag(x)); + } }; +#endif - } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/MathFunctionsImpl.h b/externals/eigen/Eigen/src/Core/MathFunctionsImpl.h index 3c9ef22f..4eaaaa78 100644 --- a/externals/eigen/Eigen/src/Core/MathFunctionsImpl.h +++ b/externals/eigen/Eigen/src/Core/MathFunctionsImpl.h @@ -17,24 +17,28 @@ namespace internal { /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which - is accurate up to a couple of ulp in the range [-9, 9], outside of which - the tanh(x) = +/-1. + is accurate up to a couple of ulps in the (approximate) range [-8, 8], + outside of which tanh(x) = +/-1 in single precision. The input is clamped + to the range [-c, c]. The value c is chosen as the smallest value where + the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004] + the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero. This implementation works on both scalars and packets. */ template T generic_fast_tanh_float(const T& a_x) { - // Clamp the inputs to the range [-9, 9] since anything outside - // this range is +/-1.0f in single-precision. - const T plus_9 = pset1(9.f); - const T minus_9 = pset1(-9.f); - // NOTE GCC prior to 6.3 might improperly optimize this max/min - // step such that if a_x is nan, x will be either 9 or -9, - // and tanh will return 1 or -1 instead of nan. 
- // This is supposed to be fixed in gcc6.3, - // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - const T x = pmax(minus_9,pmin(plus_9,a_x)); + // Clamp the inputs to the range [-c, c] +#ifdef EIGEN_VECTORIZE_FMA + const T plus_clamp = pset1(7.99881172180175781f); + const T minus_clamp = pset1(-7.99881172180175781f); +#else + const T plus_clamp = pset1(7.90531110763549805f); + const T minus_clamp = pset1(-7.90531110763549805f); +#endif + const T tiny = pset1(0.0004f); + const T x = pmax(pmin(a_x, plus_clamp), minus_clamp); + const T tiny_mask = pcmp_lt(pabs(a_x), tiny); // The monomial coefficients of the numerator polynomial (odd). const T alpha_1 = pset1(4.89352455891786e-03f); const T alpha_3 = pset1(6.37261928875436e-04f); @@ -62,13 +66,131 @@ T generic_fast_tanh_float(const T& a_x) p = pmadd(x2, p, alpha_1); p = pmul(x, p); - // Evaluate the denominator polynomial p. + // Evaluate the denominator polynomial q. T q = pmadd(x2, beta_6, beta_4); q = pmadd(x2, q, beta_2); q = pmadd(x2, q, beta_0); // Divide the numerator by the denominator. - return pdiv(p, q); + return pselect(tiny_mask, x, pdiv(p, q)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) +{ + // IEEE IEC 6059 special cases. 
+ if ((numext::isinf)(x) || (numext::isinf)(y)) + return NumTraits::infinity(); + if ((numext::isnan)(x) || (numext::isnan)(y)) + return NumTraits::quiet_NaN(); + + EIGEN_USING_STD(sqrt); + RealScalar p, qp; + p = numext::maxi(x,y); + if(p==RealScalar(0)) return RealScalar(0); + qp = numext::mini(y,x) / p; + return p * sqrt(RealScalar(1) + qp*qp); +} + +template +struct hypot_impl +{ + typedef typename NumTraits::Real RealScalar; + static EIGEN_DEVICE_FUNC + inline RealScalar run(const Scalar& x, const Scalar& y) + { + EIGEN_USING_STD(abs); + return positive_real_hypot(abs(x), abs(y)); + } +}; + +// Generic complex sqrt implementation that correctly handles corner cases +// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt +template +EIGEN_DEVICE_FUNC std::complex complex_sqrt(const std::complex& z) { + // Computes the principal sqrt of the input. + // + // For a complex square root of the number x + i*y. We want to find real + // numbers u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = y / (2 * u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = y / (2 * v) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w, v = sign(y) * w + // if x > 0: u = w, v = y / (2 * w) + // if x < 0: u = |y| / (2 * w), v = sign(y) * w + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y))); + + return + (numext::isinf)(y) ? std::complex(NumTraits::infinity(), y) + : x == zero ? std::complex(w, y < zero ? -w : w) + : x > zero ? std::complex(w, y / (2 * w)) + : std::complex(numext::abs(y) / (2 * w), y < zero ? -w : w ); +} + +// Generic complex rsqrt implementation. 
+template +EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& z) { + // Computes the principal reciprocal sqrt of the input. + // + // For a complex reciprocal square root of the number z = x + i*y. We want to + // find real numbers u and v such that + // (u + i*v)^2 = 1 / (x + i*y) <=> + // u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x/|z|^2 + // 2*u*v = y/|z|^2. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + |z|)) / |z| + // v = -y / (2 * u * |z|) + // and for x < 0, + // v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z| + // u = -y / (2 * v * |z|) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w / |z|, v = -sign(y) * w / |z| + // if x > 0: u = w / |z|, v = -y / (2 * w * |z|) + // if x < 0: u = |y| / (2 * w * |z|), v = -sign(y) * w / |z| + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + + const T abs_z = numext::hypot(x, y); + const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z)); + const T woz = w / abs_z; + // Corner cases consistent with 1/sqrt(z) on gcc/clang. + return + abs_z == zero ? std::complex(NumTraits::infinity(), NumTraits::quiet_NaN()) + : ((numext::isinf)(x) || (numext::isinf)(y)) ? std::complex(zero, zero) + : x == zero ? std::complex(woz, y < zero ? woz : -woz) + : x > zero ? std::complex(woz, -y / (2 * w * abs_z)) + : std::complex(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz ); +} + +template +EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z) { + // Computes complex log. 
+ T a = numext::abs(z); + EIGEN_USING_STD(atan2); + T b = atan2(z.imag(), z.real()); + return std::complex(numext::log(a), b); } } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/Matrix.h b/externals/eigen/Eigen/src/Core/Matrix.h index 90c336d8..f0e59a91 100644 --- a/externals/eigen/Eigen/src/Core/Matrix.h +++ b/externals/eigen/Eigen/src/Core/Matrix.h @@ -29,7 +29,7 @@ struct traits > required_alignment = unpacket_traits::alignment, packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0 }; - + public: typedef _Scalar Scalar; typedef Dense StorageKind; @@ -44,7 +44,7 @@ struct traits > Options = _Options, InnerStrideAtCompileTime = 1, OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime, - + // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, Alignment = actual_alignment @@ -255,55 +255,93 @@ class Matrix * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix() : Base() + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix() : Base() { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } // FIXME is it still needed - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } - EIGEN_DEVICE_FUNC + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { - other.swap(*this); + Base::operator=(std::move(other)); return *this; } #endif - #ifndef EIGEN_PARSED_BY_DOXYGEN +#if EIGEN_HAS_CXX11 + /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args) + * + * Example: \include Matrix_variadic_ctor_cxx11.cpp + * Output: \verbinclude Matrix_variadic_ctor_cxx11.out + * + * \sa Matrix(const std::initializer_list>&) + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + : Base(a0, a1, a2, a3, args...) {} + + /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: + * + * Example: \include Matrix_initializer_list_23_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_23_cxx11.out + * + * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. + * + * In the case of a compile-time column vector, implicit transposition from a single row is allowed. + * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax + * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: + * + * Example: \include Matrix_initializer_list_vector_cxx11.cpp + * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out + * + * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, + * and implicit transposition is allowed for compile-time vectors only. + * + * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} +#endif // end EIGEN_HAS_CXX11 + +#ifndef EIGEN_PARSED_BY_DOXYGEN // This constructor is for both 1x1 matrices and dynamic vectors template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit Matrix(const T& x) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit Matrix(const T& x) { Base::_check_template_params(); Base::template _init1(x); } template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Matrix(const T0& x, const T1& y) { Base::_check_template_params(); Base::template _init2(x, y); } - #else + + +#else /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Matrix(const Scalar *data); @@ -313,7 +351,7 @@ class Matrix * This is useful for dynamic-size vectors. For fixed-size vectors, * it is redundant to pass these parameters, so one should use the default constructor * Matrix() instead. - * + * * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance, * calling Matrix(1) will call the initialization constructor: Matrix(const Scalar&). * For fixed-size \c 1x1 matrices it is therefore recommended to use the default @@ -321,14 +359,15 @@ class Matrix * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives). */ EIGEN_STRONG_INLINE explicit Matrix(Index dim); - /** \brief Constructs an initialized 1x1 matrix with the given coefficient */ + /** \brief Constructs an initialized 1x1 matrix with the given coefficient + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x); /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns. * * This is useful for dynamic-size matrices. 
For fixed-size matrices, * it is redundant to pass these parameters, so one should use the default constructor * Matrix() instead. - * + * * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance, * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y). * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default @@ -337,12 +376,15 @@ class Matrix */ EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols); - - /** \brief Constructs an initialized 2D vector with given coefficients */ + + /** \brief Constructs an initialized 2D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ Matrix(const Scalar& x, const Scalar& y); - #endif + #endif // end EIGEN_PARSED_BY_DOXYGEN - /** \brief Constructs an initialized 3D vector with given coefficients */ + /** \brief Constructs an initialized 3D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) + */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { @@ -352,7 +394,9 @@ class Matrix m_storage.data()[1] = y; m_storage.data()[2] = z; } - /** \brief Constructs an initialized 4D vector with given coefficients */ + /** \brief Constructs an initialized 4D vector with given coefficients + * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) 
+ */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { @@ -379,8 +423,10 @@ class Matrix : Base(other.derived()) { } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return 1; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return this->innerSize(); } /////////// Geometry module /////////// @@ -407,7 +453,7 @@ class Matrix * * \ingroup Core_Module * - * Eigen defines several typedef shortcuts for most common matrix and vector types. + * %Eigen defines several typedef shortcuts for most common matrix and vector types. * * The general patterns are the following: * @@ -420,6 +466,15 @@ class Matrix * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is * a fixed-size vector of 4 complex floats. * + * With \cpp11, template alias are also defined for common sizes. + * They follow the same pattern as above except that the scalar type suffix is replaced by a + * template parameter, i.e.: + * - `MatrixSize` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size. + * - `MatrixXSize` and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. + * - `VectorSize` and `RowVectorSize` for column and row vectors. + * + * With \cpp11, you can also use fully generic column and row vector types: `Vector` and `RowVector`. 
+ * * \sa class Matrix */ @@ -456,6 +511,55 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS +#if EIGEN_HAS_CXX11 + +#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Vector##SizeSuffix = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using RowVector##SizeSuffix = Matrix; + +#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##Size##X = Matrix; \ +/** \ingroup matrixtypedefs */ \ +/** \brief \cpp11 */ \ +template \ +using Matrix##X##Size = Matrix; + +EIGEN_MAKE_TYPEDEFS(2, 2) +EIGEN_MAKE_TYPEDEFS(3, 3) +EIGEN_MAKE_TYPEDEFS(4, 4) +EIGEN_MAKE_TYPEDEFS(Dynamic, X) +EIGEN_MAKE_FIXED_TYPEDEFS(2) +EIGEN_MAKE_FIXED_TYPEDEFS(3) +EIGEN_MAKE_FIXED_TYPEDEFS(4) + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using Vector = Matrix; + +/** \ingroup matrixtypedefs + * \brief \cpp11 */ +template +using RowVector = Matrix; + +#undef EIGEN_MAKE_TYPEDEFS +#undef EIGEN_MAKE_FIXED_TYPEDEFS + +#endif // EIGEN_HAS_CXX11 + } // end namespace Eigen #endif // EIGEN_MATRIX_H diff --git a/externals/eigen/Eigen/src/Core/MatrixBase.h b/externals/eigen/Eigen/src/Core/MatrixBase.h index f7cf04cd..45c3a596 100644 --- a/externals/eigen/Eigen/src/Core/MatrixBase.h +++ b/externals/eigen/Eigen/src/Core/MatrixBase.h @@ -76,6 +76,7 @@ template class MatrixBase using Base::coeffRef; using Base::lazyAssign; using Base::eval; + using Base::operator-; using Base::operator+=; using Base::operator-=; using Base::operator*=; @@ -122,7 +123,6 @@ template class MatrixBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include 
"../plugins/CommonCwiseBinaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/MatrixCwiseBinaryOps.h" @@ -160,20 +160,11 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef __CUDACC__ template EIGEN_DEVICE_FUNC - const Product - operator*(const MatrixBase &other) const - { return this->lazyProduct(other); } -#else - - template const Product operator*(const MatrixBase &other) const; -#endif - template EIGEN_DEVICE_FUNC const Product @@ -277,6 +268,8 @@ template class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setUnit(Index i); + EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; @@ -294,7 +287,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator!= */ template - inline bool operator==(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase& other) const { return cwiseEqual(other).all(); } /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. @@ -302,10 +295,10 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator== */ template - inline bool operator!=(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } - NoAlias noalias(); + NoAlias EIGEN_DEVICE_FUNC noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround. 
@@ -335,6 +328,7 @@ template class MatrixBase inline const PartialPivLU lu() const; + EIGEN_DEVICE_FUNC inline const Inverse inverse() const; template @@ -344,12 +338,15 @@ template class MatrixBase bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + template inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + + EIGEN_DEVICE_FUNC Scalar determinant() const; /////////// Cholesky module /////////// @@ -421,15 +418,19 @@ template class MatrixBase ////////// Householder module /////////// + EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); template + EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const; template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); @@ -437,8 +438,10 @@ template class MatrixBase ///////// Jacobi module ///////// template + EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template + EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// @@ -453,19 +456,33 @@ template class MatrixBase ///////// MatrixFunctions module ///////// typedef typename internal::stem_function::type StemFunction; - const MatrixExponentialReturnValue exp() const; +#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \ + /** \returns an expression of the matrix Description of \c *this. \brief This function requires the unsupported MatrixFunctions module. To compute the coefficient-wise Description use ArrayBase::##Name . 
*/ \ + const ReturnType Name() const; +#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \ + /** \returns an expression of the matrix Description of \c *this. \brief This function requires the unsupported MatrixFunctions module. To compute the coefficient-wise Description use ArrayBase::##Name . */ \ + const ReturnType Name(Argument) const; + + EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential) + /** \brief Helper function for the unsupported MatrixFunctions module.*/ const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; - const MatrixFunctionReturnValue cosh() const; - const MatrixFunctionReturnValue sinh() const; - const MatrixFunctionReturnValue cos() const; - const MatrixFunctionReturnValue sin() const; - const MatrixSquareRootReturnValue sqrt() const; - const MatrixLogarithmReturnValue log() const; - const MatrixPowerReturnValue pow(const RealScalar& p) const; - const MatrixComplexPowerReturnValue pow(const std::complex& p) const; + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) +#if EIGEN_HAS_CXX11_MATH + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) +#endif + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) + EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) + EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm) + EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p) + EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex& p) protected: - EIGEN_DEVICE_FUNC MatrixBase() : Base() {} + 
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase) private: EIGEN_DEVICE_FUNC explicit MatrixBase(int); diff --git a/externals/eigen/Eigen/src/Core/NestByValue.h b/externals/eigen/Eigen/src/Core/NestByValue.h index 13adf070..b4275768 100644 --- a/externals/eigen/Eigen/src/Core/NestByValue.h +++ b/externals/eigen/Eigen/src/Core/NestByValue.h @@ -16,7 +16,11 @@ namespace Eigen { namespace internal { template struct traits > : public traits -{}; +{ + enum { + Flags = traits::Flags & ~NestByRefBit + }; +}; } /** \class NestByValue @@ -41,57 +45,13 @@ template class NestByValue EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } - EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const - { - return m_expression.coeff(row, col); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) - { - return m_expression.const_cast_derived().coeffRef(row, col); - } - - EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const - { - return m_expression.coeff(index); - } - - EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) - { - return m_expression.const_cast_derived().coeffRef(index); - } - - template - inline const PacketScalar packet(Index row, Index col) const - { - return m_expression.template packet(row, col); - } - - template - inline void writePacket(Index row, Index col, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(row, col, x); - } - - template - inline const PacketScalar packet(Index index) const - { - return m_expression.template 
packet(index); - } - - template - inline void writePacket(Index index, const PacketScalar& x) - { - m_expression.const_cast_derived().template writePacket(index, x); - } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_expression.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_expression.cols(); } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; } + EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } + protected: const ExpressionType m_expression; }; @@ -99,12 +59,27 @@ template class NestByValue /** \returns an expression of the temporary version of *this. */ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); } +namespace internal { + +// Evaluator of Solve -> eval into a temporary +template +struct evaluator > + : public evaluator +{ + typedef evaluator Base; + + EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue& xpr) + : Base(xpr.nestedExpression()) + {} +}; +} + } // end namespace Eigen #endif // EIGEN_NESTBYVALUE_H diff --git a/externals/eigen/Eigen/src/Core/NoAlias.h b/externals/eigen/Eigen/src/Core/NoAlias.h index 33908010..570283d9 100644 --- a/externals/eigen/Eigen/src/Core/NoAlias.h +++ b/externals/eigen/Eigen/src/Core/NoAlias.h @@ -33,6 +33,7 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template @@ -74,10 +75,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only usefull when + * expressions have this flag. Therefore, noalias() is only useful when * the source expression contains a matrix product. 
* - * Here are some examples where noalias is usefull: + * Here are some examples where noalias is useful: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; @@ -98,7 +99,7 @@ class NoAlias * \sa class NoAlias */ template -NoAlias MatrixBase::noalias() +NoAlias EIGEN_DEVICE_FUNC MatrixBase::noalias() { return NoAlias(derived()); } diff --git a/externals/eigen/Eigen/src/Core/NumTraits.h b/externals/eigen/Eigen/src/Core/NumTraits.h index dd61195b..72eac5a9 100644 --- a/externals/eigen/Eigen/src/Core/NumTraits.h +++ b/externals/eigen/Eigen/src/Core/NumTraits.h @@ -21,12 +21,14 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits10_impl { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return std::numeric_limits::digits10; } }; template struct default_digits10_impl // Floating point { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { using std::log10; using std::ceil; @@ -38,11 +40,64 @@ struct default_digits10_impl // Floating point template struct default_digits10_impl // Integer { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { return 0; } +}; + + +// default implementation of digits(), based on numeric_limits if specialized, +// 0 for integer types, and log2(epsilon()) otherwise. 
+template< typename T, + bool use_numeric_limits = std::numeric_limits::is_specialized, + bool is_integer = NumTraits::IsInteger> +struct default_digits_impl +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { return std::numeric_limits::digits; } +}; + +template +struct default_digits_impl // Floating point +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static int run() { + using std::log; + using std::ceil; + typedef typename NumTraits::Real Real; + return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); + } +}; + +template +struct default_digits_impl // Integer +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int run() { return 0; } }; } // end namespace internal +namespace numext { +/** \internal bit-wise cast without changing the underlying bit representation. */ + +// TODO: Replace by std::bit_cast (available in C++20) +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { +#if EIGEN_HAS_TYPE_TRAITS + // The behaviour of memcpy is not specified for non-trivially copyable types + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value, THIS_TYPE_IS_NOT_SUPPORTED); + EIGEN_STATIC_ASSERT(std::is_trivially_copyable::value && std::is_default_constructible::value, + THIS_TYPE_IS_NOT_SUPPORTED); +#endif + + EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED); + Tgt tgt; + EIGEN_USING_STD(memcpy) + memcpy(&tgt, &src, sizeof(Tgt)); + return tgt; +} +} // namespace numext + /** \class NumTraits * \ingroup Core_Module * @@ -71,7 +126,7 @@ struct default_digits10_impl // Integer * and to \c 0 otherwise. * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed * to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers. - * Stay vague here. No need to do architecture-specific stuff. + * Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost. 
* \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned. * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must * be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise. @@ -80,9 +135,18 @@ struct default_digits10_impl // Integer * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default * value by the fuzzy comparison operators. * \li highest() and lowest() functions returning the highest and lowest possible values respectively. + * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is + * the analogue of std::numeric_limits::digits + * which is used as the default implementation if specialized. * \li digits10() function returning the number of decimal digits that can be represented without change. This is * the analogue of std::numeric_limits::digits10 * which is used as the default implementation if specialized. + * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively, + * such that the radix raised to the power exponent-1 is a normalized floating-point number. These are equivalent to + * std::numeric_limits::min_exponent/ + * std::numeric_limits::max_exponent. + * \li infinity() function returning a representation of positive infinity, if available. + * \li quiet_NaN function returning a non-signaling "not-a-number", if available. 
*/ template struct GenericNumTraits @@ -106,42 +170,60 @@ template struct GenericNumTraits typedef T Nested; typedef T Literal; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return numext::numeric_limits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return internal::default_digits10_impl::run(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int digits() + { + return internal::default_digits_impl::run(); + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int min_exponent() + { + return numext::numeric_limits::min_exponent; + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline int max_exponent() + { + return numext::numeric_limits::max_exponent; + } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { // make sure to override this for floating-point types return Real(0); } - - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T highest() { return (numext::numeric_limits::max)(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T lowest() { - return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); + return IsInteger ? 
(numext::numeric_limits::min)() + : static_cast(-(numext::numeric_limits::max)()); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T infinity() { return numext::numeric_limits::infinity(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline T quiet_NaN() { return numext::numeric_limits::quiet_NaN(); } @@ -153,19 +235,20 @@ template struct NumTraits : GenericNumTraits template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline float dummy_precision() { return 1e-5f; } }; template<> struct NumTraits : GenericNumTraits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline double dummy_precision() { return 1e-12; } }; template<> struct NumTraits : GenericNumTraits { + EIGEN_CONSTEXPR static inline long double dummy_precision() { return 1e-15l; } }; @@ -182,11 +265,11 @@ template struct NumTraits > MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline Real dummy_precision() { return NumTraits::dummy_precision(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline int digits10() { return NumTraits::digits10(); } }; @@ -206,15 +289,18 @@ struct NumTraits > IsInteger = NumTraits::IsInteger, IsSigned = NumTraits::IsSigned, RequireInitialization = 1, - ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::ReadCost, - AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, - MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost + ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? 
HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::ReadCost), + AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::AddCost), + MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::MulCost) }; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + + EIGEN_CONSTEXPR + static inline int digits10() { return NumTraits::digits10(); } }; template<> struct NumTraits @@ -227,6 +313,7 @@ template<> struct NumTraits MulCost = HugeCost }; + EIGEN_CONSTEXPR static inline int digits10() { return 0; } private: @@ -241,6 +328,8 @@ template<> struct NumTraits // Empty specialization for void to allow template specialization based on NumTraits::Real with T==void and SFINAE. template<> struct NumTraits {}; +template<> struct NumTraits : GenericNumTraits {}; + } // end namespace Eigen #endif // EIGEN_NUMTRAITS_H diff --git a/externals/eigen/Eigen/src/Core/PartialReduxEvaluator.h b/externals/eigen/Eigen/src/Core/PartialReduxEvaluator.h new file mode 100644 index 00000000..29abf35b --- /dev/null +++ b/externals/eigen/Eigen/src/Core/PartialReduxEvaluator.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PARTIALREDUX_H +#define EIGEN_PARTIALREDUX_H + +namespace Eigen { + +namespace internal { + + +/*************************************************************************** +* +* This file provides evaluators for partial reductions. 
+* There are two modes: +* +* - scalar path: simply calls the respective function on the column or row. +* -> nothing special here, all the tricky part is handled by the return +* types of VectorwiseOp's members. They embed the functor calling the +* respective DenseBase's member function. +* +* - vectorized path: implements a packet-wise reductions followed by +* some (optional) processing of the outcome, e.g., division by n for mean. +* +* For the vectorized path let's observe that the packet-size and outer-unrolling +* are both decided by the assignement logic. So all we have to do is to decide +* on the inner unrolling. +* +* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, +* but be need to be careful to specify correct increment. +* +***************************************************************************/ + + +/* logic deciding a strategy for unrolling of vectorized paths */ +template +struct packetwise_redux_traits +{ + enum { + OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime, + Cost = OuterSize == Dynamic ? HugeCost + : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits::Cost, + Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? 
CompleteUnrolling : NoUnrolling + }; + +}; + +/* Value to be returned when size==0 , by default let's return 0 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } + +/* For products the default is 1 */ +template +EIGEN_DEVICE_FUNC +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } + +/* Perform the actual reduction */ +template::Unrolling +> +struct packetwise_redux_impl; + +/* Perform the actual reduction with unrolling */ +template +struct packetwise_redux_impl +{ + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + PacketType run(const Evaluator &eval, const Func& func, Index /*size*/) + { + return redux_vec_unroller::OuterSize>::template run(eval,func); + } +}; + +/* Add a specialization of redux_vec_unroller for size==0 at compiletime. + * This specialization is not required for general reductions, which is + * why it is defined here. 
+ */ +template +struct redux_vec_unroller +{ + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f) + { + return packetwise_redux_empty_value(f); + } +}; + +/* Perform the actual reduction for dynamic sizes */ +template +struct packetwise_redux_impl +{ + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + EIGEN_DEVICE_FUNC + static PacketType run(const Evaluator &eval, const Func& func, Index size) + { + if(size==0) + return packetwise_redux_empty_value(func); + + const Index size4 = (size-1)&(~3); + PacketType p = eval.template packetByOuterInner(0,0); + Index i = 1; + // This loop is optimized for instruction pipelining: + // - each iteration generates two independent instructions + // - thanks to branch prediction and out-of-order execution we have independent instructions across loops + for(; i(i+0,0),eval.template packetByOuterInner(i+1,0)), + func.packetOp(eval.template packetByOuterInner(i+2,0),eval.template packetByOuterInner(i+3,0)))); + for(; i(i,0)); + return p; + } +}; + +template< typename ArgType, typename MemberOp, int Direction> +struct evaluator > + : evaluator_base > +{ + typedef PartialReduxExpr XprType; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::add_const_on_value_type::type ConstArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef typename ArgType::Scalar InputScalar; + typedef typename XprType::Scalar Scalar; + enum { + TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) + }; + typedef typename MemberOp::template Cost CostOpType; + enum { + CoeffReadCost = TraversalSize==Dynamic ? HugeCost + : TraversalSize==0 ? 
1 + : int(TraversalSize) * int(evaluator::CoeffReadCost) + int(CostOpType::value), + + _ArgFlags = evaluator::Flags, + + _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) + && bool(MemberOp::Vectorizable) + && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) + && (TraversalSize!=0), + + Flags = (traits::Flags&RowMajorBit) + | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) + | (_Vectorizable ? PacketAccessBit : 0) + | LinearAccessBit, + + Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized + }; + + EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) + : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value))); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const + { + return coeff(Direction==Vertical ? j : i); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const + { + return m_functor(m_arg.template subVector(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketType packet(Index i, Index j) const + { + return packet(Direction==Vertical ? j : i); + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + PacketType packet(Index idx) const + { + enum { PacketSize = internal::unpacket_traits::size }; + typedef Block PanelType; + + PanelType panel(m_arg, + Direction==Vertical ? 0 : idx, + Direction==Vertical ? idx : 0, + Direction==Vertical ? m_arg.rows() : Index(PacketSize), + Direction==Vertical ? Index(PacketSize) : m_arg.cols()); + + // FIXME + // See bug 1612, currently if PacketSize==1 (i.e. complex with 128bits registers) then the storage-order of panel get reversed + // and methods like packetByOuterInner do not make sense anymore in this context. 
+ // So let's just by pass "vectorization" in this case: + if(PacketSize==1) + return internal::pset1(coeff(idx)); + + typedef typename internal::redux_evaluator PanelEvaluator; + PanelEvaluator panel_eval(panel); + typedef typename MemberOp::BinaryOp BinaryOp; + PacketType p = internal::packetwise_redux_impl::template run(panel_eval,m_functor.binaryFunc(),m_arg.outerSize()); + return p; + } + +protected: + ConstArgTypeNested m_arg; + const MemberOp m_functor; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PARTIALREDUX_H diff --git a/externals/eigen/Eigen/src/Core/PermutationMatrix.h b/externals/eigen/Eigen/src/Core/PermutationMatrix.h index b1fb455b..69401bf4 100644 --- a/externals/eigen/Eigen/src/Core/PermutationMatrix.h +++ b/externals/eigen/Eigen/src/Core/PermutationMatrix.h @@ -87,25 +87,14 @@ class PermutationBase : public EigenBase return derived(); } - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** This is a special case of the templated operator=. Its purpose is to - * prevent a default operator= from hiding the templated operator=. 
- */ - Derived& operator=(const PermutationBase& other) - { - indices() = other.indices(); - return derived(); - } - #endif - /** \returns the number of rows */ - inline Index rows() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); } /** \returns the number of columns */ - inline Index cols() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); } /** \returns the size of a side of the respective square matrix, i.e., the number of indices */ - inline Index size() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); } #ifndef EIGEN_PARSED_BY_DOXYGEN template @@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase& other) : m_indices(other.indices()) {} - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** Standard copy constructor. Defined only to prevent a default copy constructor - * from hiding the other templated constructor */ - inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {} - #endif - /** Generic constructor from expression of the indices. The indices * array has the meaning that the permutations sends each integer i to indices[i]. 
* @@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase::quiet_NaN(); +# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(Index i=0;i::quiet_NaN(); #else # undef EIGEN_INITIALIZE_COEFFS # define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED @@ -104,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Scalar Scalar; - + typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; typedef Derived DenseType; @@ -118,16 +118,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type using Base::IsVectorAtCompileTime; using Base::Flags; - template friend class Eigen::Map; - friend class Eigen::Map; typedef Eigen::Map MapType; - friend class Eigen::Map; typedef const Eigen::Map ConstMapType; -#if EIGEN_MAX_ALIGN_BYTES>0 - // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice. 
- friend class Eigen::Map; - friend class Eigen::Map; -#endif typedef Eigen::Map AlignedMapType; typedef const Eigen::Map ConstAlignedMapType; template struct StridedMapType { typedef Eigen::Map type; }; @@ -147,10 +139,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast(this); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_storage.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_storage.cols(); } /** This is an overloaded version of DenseCoeffsBase::coeff(Index,Index) const * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts. @@ -358,7 +350,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resizeLike(const EigenBase& _other) { const OtherDerived& other = _other.derived(); @@ -383,7 +375,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will be uninitialized. */ EIGEN_DEVICE_FUNC @@ -440,7 +432,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. 
In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will copied from \c other. */ template @@ -508,8 +500,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT { - using std::swap; - swap(m_storage, other.m_storage); + _check_template_params(); + m_storage = std::move(other.m_storage); return *this; } #endif @@ -526,6 +518,71 @@ class PlainObjectBase : public internal::dense_xpr_base::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } + #if EIGEN_HAS_CXX11 + /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 + * + * \only_for_vectors + * + * This constructor is for 1D array or vectors with more than 4 coefficients. + * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this + * constructor must match the the fixed number of rows (resp. columns) of \c *this. + */ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... 
args) + : m_storage() + { + _check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); + m_storage.data()[0] = a0; + m_storage.data()[1] = a1; + m_storage.data()[2] = a2; + m_storage.data()[3] = a3; + Index i = 4; + auto x = {(m_storage.data()[i++] = args, 0)...}; + static_cast(x); + } + + /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer + * lists \cpp11 + */ + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) + : m_storage() + { + _check_template_params(); + + size_t list_size = 0; + if (list.begin() != list.end()) { + list_size = list.begin()->size(); + } + + // This is to allow syntax like VectorXi {{1, 2, 3, 4}} + if (ColsAtCompileTime == 1 && list.size() == 1) { + eigen_assert(list_size == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + resize(list_size, ColsAtCompileTime); + std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data()); + } else { + eigen_assert(list.size() == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); + eigen_assert(list_size == static_cast(ColsAtCompileTime) || ColsAtCompileTime == Dynamic); + resize(list.size(), list_size); + + Index row_index = 0; + for (const std::initializer_list& row : list) { + eigen_assert(list_size == row.size()); + Index col_index = 0; + for (const Scalar& e : row) { + coeffRef(row_index, col_index) = e; + ++col_index; + } + ++row_index; + } + } + } + #endif // end EIGEN_HAS_CXX11 + /** \sa PlainObjectBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC @@ -564,7 +621,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \copydetails DenseBase::operator=(const EigenBase &other) */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other) { _resize_to_match(other); @@ -577,6 +634,10 @@ class PlainObjectBase : public 
internal::dense_xpr_base::type * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. * + * Here is an example using strides: + * \include Matrix_Map_stride.cpp + * Output: \verbinclude Matrix_Map_stride.out + * * \see class Map */ //@{ @@ -648,18 +709,26 @@ class PlainObjectBase : public internal::dense_xpr_base::type using Base::setConstant; EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val); using Base::setZero; EIGEN_DEVICE_FUNC Derived& setZero(Index size); EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols); + EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t); using Base::setOnes; EIGEN_DEVICE_FUNC Derived& setOnes(Index size); EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols); + EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t); using Base::setRandom; Derived& setRandom(Index size); Derived& setRandom(Index rows, Index cols); + Derived& setRandom(NoChange_t, Index cols); + Derived& setRandom(Index rows, NoChange_t); #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN #include EIGEN_PLAINOBJECTBASE_PLUGIN @@ -674,7 +743,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. 
*/ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase& other) { #ifdef EIGEN_NO_AUTOMATIC_RESIZING @@ -701,10 +770,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \internal */ - // aliasing is dealt once in internall::call_assignment + // aliasing is dealt once in internal::call_assignment // so at this stage we have to assume aliasing... and resising has to be done later. template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set(const DenseBase& other) { internal::call_assignment(this->derived(), other.derived()); @@ -717,7 +786,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \sa operator=(const MatrixBase&), _set() */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase& other) { // I don't think we need this resize call since the lazyAssign will anyways resize @@ -733,23 +802,25 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if::type* = 0) { - EIGEN_STATIC_ASSERT(bool(NumTraits::IsInteger) && - bool(NumTraits::IsInteger), + const bool t0_is_integer_alike = internal::is_valid_index_type::value; + const bool t1_is_integer_alike = internal::is_valid_index_type::value; + EIGEN_STATIC_ASSERT(t0_is_integer_alike && + t1_is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if::type* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) m_storage.data()[0] = Scalar(val0); m_storage.data()[1] = Scalar(val1); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1, typename internal::enable_if< 
(!internal::is_same::value) && (internal::is_same::value) @@ -769,14 +840,14 @@ class PlainObjectBase : public internal::dense_xpr_base::type && ((!internal::is_same::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) { // NOTE MSVC 2008 complains if we directly put bool(NumTraits::IsInteger) as the EIGEN_STATIC_ASSERT argument. - const bool is_integer = NumTraits::IsInteger; - EIGEN_UNUSED_VARIABLE(is_integer); - EIGEN_STATIC_ASSERT(is_integer, + const bool is_integer_alike = internal::is_valid_index_type::value; + EIGEN_UNUSED_VARIABLE(is_integer_alike); + EIGEN_STATIC_ASSERT(is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } - - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) + + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) @@ -784,7 +855,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) m_storage.data()[0] = val0; } - + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type) template EIGEN_DEVICE_FUNC @@ -840,7 +911,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { this->derived() = r; } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -852,7 +923,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -866,38 +937,38 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + template friend struct internal::matrix_swap_impl; public: 
- + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal * \brief Override DenseBase::swap() since for dynamic-sized matrices * of same type it is enough to swap the data pointers. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase & other) { enum { SwapPointers = internal::is_same::value && Base::SizeAtCompileTime==Dynamic }; internal::matrix_swap_impl::run(this->derived(), other.derived()); } - + /** \internal * \brief const version forwarded to DenseBase::swap */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase const & other) { Base::swap(other.derived()); } - - EIGEN_DEVICE_FUNC + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void _check_template_params() { - EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor) - && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0) + EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (int(Options)&RowMajor)==RowMajor) + && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (int(Options)&RowMajor)==0) && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0)) && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0)) && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0)) @@ -909,6 +980,17 @@ class PlainObjectBase : public internal::dense_xpr_base::type } enum { IsPlainObjectBase = 1 }; +#endif + public: + // These apparently need to be down here for nvcc+icc to prevent duplicate + // Map symbol. + template friend class Eigen::Map; + friend class Eigen::Map; + friend class Eigen::Map; +#if EIGEN_MAX_ALIGN_BYTES>0 + // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice. 
+ friend class Eigen::Map; + friend class Eigen::Map; #endif }; @@ -917,13 +999,19 @@ namespace internal { template struct conservative_resize_like_impl { + #if EIGEN_HAS_TYPE_TRAITS + static const bool IsRelocatable = std::is_trivially_copyable::value; + #else + static const bool IsRelocatable = !NumTraits::RequireInitialization; + #endif static void run(DenseBase& _this, Index rows, Index cols) { if (_this.rows() == rows && _this.cols() == cols) return; EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) - if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns + if ( IsRelocatable + && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns { internal::check_rows_cols_for_overflow::run(rows, cols); _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); @@ -931,7 +1019,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. 
- typename Derived::PlainObject tmp(rows,cols); + Derived tmp(rows,cols); const Index common_rows = numext::mini(rows, _this.rows()); const Index common_cols = numext::mini(cols, _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -951,8 +1039,9 @@ struct conservative_resize_like_impl EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) - if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns + if ( IsRelocatable && + (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns { const Index new_rows = other.rows() - _this.rows(); const Index new_cols = other.cols() - _this.cols(); @@ -965,7 +1054,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. - typename Derived::PlainObject tmp(other); + Derived tmp(other); const Index common_rows = numext::mini(tmp.rows(), _this.rows()); const Index common_cols = numext::mini(tmp.cols(), _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -980,13 +1069,18 @@ template struct conservative_resize_like_impl : conservative_resize_like_impl { - using conservative_resize_like_impl::run; - + typedef conservative_resize_like_impl Base; + using Base::run; + using Base::IsRelocatable; + static void run(DenseBase& _this, Index size) { const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; const Index new_cols = Derived::RowsAtCompileTime==1 ? 
size : 1; - _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); } static void run(DenseBase& _this, const DenseBase& other) @@ -997,7 +1091,10 @@ struct conservative_resize_like_impl const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; - _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); @@ -1008,7 +1105,7 @@ template struct matrix_swap_impl { EIGEN_DEVICE_FUNC - static inline void run(MatrixTypeA& a, MatrixTypeB& b) + static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b) { a.base().swap(b); } diff --git a/externals/eigen/Eigen/src/Core/Product.h b/externals/eigen/Eigen/src/Core/Product.h index ae0c94b3..70a6c106 100644 --- a/externals/eigen/Eigen/src/Core/Product.h +++ b/externals/eigen/Eigen/src/Core/Product.h @@ -23,25 +23,25 @@ struct traits > typedef typename remove_all::type RhsCleaned; typedef traits LhsTraits; typedef traits RhsTraits; - + typedef MatrixXpr XprKind; - + typedef typename ScalarBinaryOpTraits::Scalar, typename traits::Scalar>::ReturnType Scalar; typedef typename product_promote_storage_type::ret>::ret StorageKind; typedef typename promote_index_type::type StorageIndex; - + enum { RowsAtCompileTime = LhsTraits::RowsAtCompileTime, ColsAtCompileTime = RhsTraits::ColsAtCompileTime, MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime, MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime, - + // FIXME: only needed by GeneralMatrixMatrixTriangular InnerSize = 
EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime), - + // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator. Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0 @@ -74,10 +74,10 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, internal::product_type<_Lhs,_Rhs>::ret>::ret> { public: - + typedef _Lhs Lhs; typedef _Rhs Rhs; - + typedef typename ProductImpl< Lhs, Rhs, Option, typename internal::product_promote_storage_type::StorageKind, @@ -90,18 +90,23 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; - EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) { eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } - EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } - EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const LhsNestedCleaned& lhs() const { return m_lhs; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const RhsNestedCleaned& rhs() const { return m_rhs; } protected: @@ -110,13 +115,13 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, }; namespace internal { - + 
template::ret> class dense_product_base : public internal::dense_xpr_base >::type {}; -/** Convertion to scalar for inner-products */ +/** Conversion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type @@ -126,8 +131,8 @@ class dense_product_base public: using Base::derived; typedef typename Base::Scalar Scalar; - - operator const Scalar() const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -148,37 +153,37 @@ class ProductImpl : public internal::dense_product_base { typedef Product Derived; - + public: - + typedef typename internal::dense_product_base Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) protected: enum { - IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && + IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic), EnableCoeff = IsOneByOne || Option==LazyProduct }; - + public: - - EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); - + return internal::evaluator(derived()).coeff(row,col); } - EIGEN_DEVICE_FUNC Scalar coeff(Index i) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); - + return internal::evaluator(derived()).coeff(i); } - - + + }; } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/ProductEvaluators.h b/externals/eigen/Eigen/src/Core/ProductEvaluators.h index 583b7f59..8cf294b2 100644 --- a/externals/eigen/Eigen/src/Core/ProductEvaluators.h +++ 
b/externals/eigen/Eigen/src/Core/ProductEvaluators.h @@ -14,27 +14,27 @@ #define EIGEN_PRODUCTEVALUATORS_H namespace Eigen { - + namespace internal { /** \internal * Evaluator of a product expression. * Since products require special treatments to handle all possible cases, - * we simply deffer the evaluation logic to a product_evaluator class + * we simply defer the evaluation logic to a product_evaluator class * which offers more partial specialization possibilities. - * + * * \sa class product_evaluator */ template -struct evaluator > +struct evaluator > : public product_evaluator > { typedef Product XprType; typedef product_evaluator Base; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; - + // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" // TODO we should apply that rule only if that's really helpful template @@ -55,20 +55,20 @@ struct evaluator, const Product > XprType; typedef evaluator > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {} }; template -struct evaluator, DiagIndex> > +struct evaluator, DiagIndex> > : public evaluator, DiagIndex> > { typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index() )) @@ -108,27 +108,27 @@ struct product_evaluator, ProductTag, LhsShape, RhsSh : m_result(xpr.rows(), xpr.cols()) { ::new (static_cast(this)) Base(m_result); - + // FIXME shall we handle nested_eval here?, // if so, then we must take care at removing the call to nested_eval in the 
specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.) // typedef typename internal::nested_eval::type LhsNested; // typedef typename internal::nested_eval::type RhsNested; // typedef typename internal::remove_all::type LhsNestedCleaned; // typedef typename internal::remove_all::type RhsNestedCleaned; -// +// // const LhsNested lhs(xpr.lhs()); // const RhsNested rhs(xpr.rhs()); -// +// // generic_product_impl::evalTo(m_result, lhs, rhs); generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); } - -protected: + +protected: PlainObject m_result; }; -// The following three shortcuts are enabled only if the scalar types match excatly. +// The following three shortcuts are enabled only if the scalar types match exactly. // TODO: we could enable them for different scalar types when the product is not vectorized. // Dense = Product @@ -137,7 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -155,7 +155,7 @@ struct Assignment, internal::add_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -170,7 +170,7 @@ struct Assignment, internal::sub_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -190,7 +190,7 @@ struct 
Assignment, const CwiseNullaryOp,Plain>, const Product > SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); @@ -207,11 +207,17 @@ struct evaluator_assume_aliasing +struct evaluator_assume_aliasing::Scalar>, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; + template struct assignment_from_xpr_op_product { template - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) { call_assignment_no_alias(dst, src.lhs(), Func1()); @@ -240,19 +246,19 @@ template struct generic_product_impl { template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } - + template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } - + template - static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -263,10 +269,10 @@ struct generic_product_impl // Column major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + 
ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const // Row major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); @@ -292,43 +298,43 @@ struct generic_product_impl { template struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; typedef typename Product::Scalar Scalar; - + // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose - struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; - struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; - struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct set { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template 
EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct adds { Scalar m_scale; explicit adds(const Scalar& s) : m_scale(s) {} - template void operator()(const Dst& dst, const Src& src) const { + template void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += m_scale * src; } }; - + template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } - + template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } - + template - static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } - + template - static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } - + }; @@ -337,21 +343,21 @@ template struct generic_product_impl_base { typedef typename Product::Scalar Scalar; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } - + template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } }; @@ -367,8 +373,13 @@ struct generic_product_impl typedef typename internal::remove_all::type>::type MatrixType; template - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { + // Fallback to inner product if both the lhs and rhs is a runtime vector. 
+ if (lhs.rows() == 1 && rhs.cols() == 1) { + dst.coeffRef(0,0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0)); + return; + } LhsNested actual_lhs(lhs); RhsNested actual_rhs(rhs); internal::gemv_dense_selector }; template -struct generic_product_impl +struct generic_product_impl { typedef typename Product::Scalar Scalar; - + template - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); } - + template - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() += lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); } - + template - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } - -// template -// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) -// { dst.noalias() += alpha * lhs.lazyProduct(rhs); } + + // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance: + // dst {,+,-}= (s1*A)*(B*s2) + // will be rewritten as: + // dst {,+,-}= (s1*s2) * (A.lazyProduct(B)) + // There are at least four benefits of doing so: + // 1 - huge performance gain for heap-allocated matrix types as it save costly allocations. 
+ // 2 - it is faster than simply by-passing the heap allocation through stack allocation. + // 3 - it makes this fallback consistent with the heavy GEMM routine. + // 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices. + // (see https://stackoverflow.com/questions/54738495) + // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently + // enabled only when falling back from the main GEMM. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func) + { + enum { + HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor, + ConjLhs = blas_traits::NeedToConjugate, + ConjRhs = blas_traits::NeedToConjugate + }; + // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto + // this is important for real*complex_mat + Scalar actualAlpha = combine_scalar_factors(lhs, rhs); + + eval_dynamic_impl(dst, + blas_traits::extract(lhs).template conjugateIf(), + blas_traits::extract(rhs).template conjugateIf(), + func, + actualAlpha, + typename conditional::type()); + } + +protected: + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type) + { + EIGEN_UNUSED_VARIABLE(s); + eigen_internal_assert(s==Scalar(1)); + call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type) + { + call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func); + } }; // This specialization enforces the use of a 
coefficient-based evaluation strategy @@ -465,7 +525,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename internal::nested_eval::type LhsNested; typedef typename internal::nested_eval::type RhsNested; - + typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; @@ -484,19 +544,19 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename find_best_packet::type RhsVecPacketType; enum { - + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, RhsCoeffReadCost = RhsEtorType::CoeffReadCost, CoeffReadCost = InnerSize==0 ? NumTraits::ReadCost : InnerSize == Dynamic ? HugeCost - : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + : InnerSize * (NumTraits::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) + (InnerSize - 1) * NumTraits::AddCost, Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - + LhsFlags = LhsEtorType::Flags, RhsFlags = RhsEtorType::Flags, - + LhsRowMajor = LhsFlags & RowMajorBit, RhsRowMajor = RhsFlags & RowMajorBit, @@ -506,7 +566,7 @@ struct product_evaluator, ProductTag, DenseShape, // Here, we don't care about alignment larger than the usable packet size. LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), - + SameType = is_same::value, CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1), @@ -516,12 +576,12 @@ struct product_evaluator, ProductTag, DenseShape, : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 : (bool(RhsRowMajor) && !CanVectorizeLhs), - Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) + Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) | (EvalToRowMajor ? 
RowMajorBit : 0) // TODO enable vectorization for mixed types | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0), - + LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)), RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)), @@ -537,10 +597,10 @@ struct product_evaluator, ProductTag, DenseShape, CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) - && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (InnerSize % packet_traits::size == 0) + && (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) + && (int(InnerSize) % packet_traits::size == 0) }; - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const { return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); @@ -550,7 +610,8 @@ struct product_evaluator, ProductTag, DenseShape, * which is why we don't set the LinearAccessBit. * TODO: this seems possible when the result is a vector */ - EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const CoeffReturnType coeff(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; @@ -558,6 +619,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const { PacketType res; @@ -569,6 +631,7 @@ struct product_evaluator, ProductTag, DenseShape, } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 
0 : index; @@ -579,7 +642,7 @@ struct product_evaluator, ProductTag, DenseShape, protected: typename internal::add_const_on_value_type::type m_lhs; typename internal::add_const_on_value_type::type m_rhs; - + LhsEtorType m_lhsImpl; RhsEtorType m_rhsImpl; @@ -597,7 +660,8 @@ struct product_evaluator, LazyCoeffBasedProduc enum { Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) : Base(BaseProduct(xpr.lhs(),xpr.rhs())) {} }; @@ -609,7 +673,7 @@ struct product_evaluator, LazyCoeffBasedProduc template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); res = pmadd(pset1(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet(Index(UnrollingIndex-1), col), res); @@ -619,7 +683,7 @@ struct etor_product_packet_impl struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) { etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); res = pmadd(lhs.template packet(row, Index(UnrollingIndex-1)), pset1(rhs.coeff(Index(UnrollingIndex-1), col)), res); @@ -629,7 +693,7 @@ struct etor_product_packet_impl struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index 
/*innerDim*/, Packet &res) { res = pmul(pset1(lhs.coeff(row, Index(0))),rhs.template packet(Index(0), col)); } @@ -638,7 +702,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res) { res = pmul(lhs.template packet(row, Index(0)), pset1(rhs.coeff(Index(0), col))); } @@ -647,7 +711,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) { res = pset1(typename unpacket_traits::type(0)); } @@ -656,7 +720,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res) { res = pset1(typename unpacket_traits::type(0)); } @@ -665,7 +729,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { res = pset1(typename unpacket_traits::type(0)); for(Index i = 0; i < innerDim; ++i) @@ -676,7 +740,7 @@ struct etor_product_packet_impl template struct etor_product_packet_impl { - static 
EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { res = pset1(typename unpacket_traits::type(0)); for(Index i = 0; i < innerDim; ++i) @@ -698,7 +762,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -712,7 +776,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -733,9 +797,10 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template - static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); } @@ -746,7 +811,7 @@ struct generic_product_impl : generic_product_impl_base > { typedef typename Product::Scalar Scalar; - + template static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { @@ -758,7 +823,7 @@ struct generic_product_impl /*************************************************************************** * Diagonal products ***************************************************************************/ - + template struct diagonal_product_evaluator_base : evaluator_base @@ -766,34 +831,49 @@ struct diagonal_product_evaluator_base typedef typename ScalarBinaryOpTraits::ReturnType Scalar; public: enum { - CoeffReadCost = NumTraits::MulCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost, - + CoeffReadCost = int(NumTraits::MulCost) + 
int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost), + MatrixFlags = evaluator::Flags, DiagFlags = evaluator::Flags, - _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + + _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor + : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor + : MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), + _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), _SameTypes = is_same::value, // FIXME currently we need same types, but in the future the next rule should be the one //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), - _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), + _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) + && _SameTypes + && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit) + && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? 
PacketAccessBit : 0), - Alignment = evaluator::Alignment + Alignment = evaluator::Alignment, + + AsScalarProduct = (DiagonalType::SizeAtCompileTime==1) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; - - diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) + + EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) : m_diagImpl(diag), m_matImpl(mat) { EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { - return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); + if(AsScalarProduct) + return m_diagImpl.coeff(0) * m_matImpl.coeff(idx); + else + return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } - + protected: template EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const @@ -801,7 +881,7 @@ struct diagonal_product_evaluator_base return internal::pmul(m_matImpl.template packet(row, col), internal::pset1(m_diagImpl.coeff(id))); } - + template EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const { @@ -812,7 +892,7 @@ struct diagonal_product_evaluator_base return internal::pmul(m_matImpl.template packet(row, col), m_diagImpl.template packet(id)); } - + evaluator m_diagImpl; evaluator m_matImpl; }; @@ -827,25 +907,25 @@ struct product_evaluator, ProductTag, DiagonalSha using Base::m_matImpl; using Base::coeff; typedef typename Base::Scalar Scalar; - + typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - - enum { - StorageOrder = int(Rhs::Flags) & RowMajorBit ? 
RowMajor : ColMajor - }; + typedef typename Lhs::DiagonalVectorType DiagonalType; + + + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const { return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } - -#ifndef __CUDACC__ + +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -854,7 +934,7 @@ struct product_evaluator, ProductTag, DiagonalSha return this->template packet_impl(row,col, row, typename internal::conditional::type()); } - + template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { @@ -873,30 +953,30 @@ struct product_evaluator, ProductTag, DenseShape, using Base::m_matImpl; using Base::coeff; typedef typename Base::Scalar Scalar; - + typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - - enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? 
RowMajor : ColMajor }; + + enum { StorageOrder = Base::_StorageOrder }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) { } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const { return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } - -#ifndef __CUDACC__ + +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { return this->template packet_impl(row,col, col, typename internal::conditional::type()); } - + template EIGEN_STRONG_INLINE PacketType packet(Index idx) const { @@ -924,7 +1004,7 @@ struct permutation_matrix_product typedef typename remove_all::type MatrixTypeCleaned; template - static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { MatrixType mat(xpr); const Index n = Side==OnTheLeft ? 
mat.rows() : mat.cols(); @@ -978,7 +1058,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs, rhs); } @@ -988,7 +1068,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, rhs, lhs); } @@ -998,7 +1078,7 @@ template struct generic_product_impl, Rhs, PermutationShape, MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs.nestedExpression(), rhs); } @@ -1008,7 +1088,7 @@ template struct generic_product_impl, MatrixShape, PermutationShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) { permutation_matrix_product::run(dst, rhs.nestedExpression(), lhs); } @@ -1030,9 +1110,9 @@ struct transposition_matrix_product { typedef typename nested_eval::type MatrixType; typedef typename remove_all::type MatrixTypeCleaned; - + template - static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) { MatrixType mat(xpr); typedef typename TranspositionType::StorageIndex StorageIndex; @@ -1055,7 +1135,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, 
const Lhs& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, lhs, rhs); } @@ -1065,7 +1145,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, rhs, lhs); } @@ -1076,7 +1156,7 @@ template struct generic_product_impl, Rhs, TranspositionsShape, MatrixShape, ProductTag> { template - static void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) { transposition_matrix_product::run(dst, lhs.nestedExpression(), rhs); } @@ -1086,7 +1166,7 @@ template struct generic_product_impl, MatrixShape, TranspositionsShape, ProductTag> { template - static void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) { transposition_matrix_product::run(dst, rhs.nestedExpression(), lhs); } diff --git a/externals/eigen/Eigen/src/Core/Random.h b/externals/eigen/Eigen/src/Core/Random.h index 6faf789c..dab2ac8e 100644 --- a/externals/eigen/Eigen/src/Core/Random.h +++ b/externals/eigen/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } @@ -177,6 +177,42 @@ PlainObjectBase::setRandom(Index rows, Index cols) return setRandom(); } +/** Resizes to the given size, changing only the number of columns, and sets all + * coefficients in this expression to random values. For the parameter of type + * NoChange_t, just pass the special value \c NoChange. 
+ * + * Numbers are uniformly spread through their whole definition range for integer types, + * and in the [-1:1] range for floating point scalar types. + * + * \not_reentrant + * + * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random() + */ +template +EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setRandom(NoChange_t, Index cols) +{ + return setRandom(rows(), cols); +} + +/** Resizes to the given size, changing only the number of rows, and sets all + * coefficients in this expression to random values. For the parameter of type + * NoChange_t, just pass the special value \c NoChange. + * + * Numbers are uniformly spread through their whole definition range for integer types, + * and in the [-1:1] range for floating point scalar types. + * + * \not_reentrant + * + * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random() + */ +template +EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setRandom(Index rows, NoChange_t) +{ + return setRandom(rows, cols()); +} + } // end namespace Eigen #endif // EIGEN_RANDOM_H diff --git a/externals/eigen/Eigen/src/Core/Redux.h b/externals/eigen/Eigen/src/Core/Redux.h index b6e8f888..b6790d11 100644 --- a/externals/eigen/Eigen/src/Core/Redux.h +++ b/externals/eigen/Eigen/src/Core/Redux.h @@ -23,23 +23,29 @@ namespace internal { * Part 1 : the logic deciding a strategy for vectorization and unrolling ***************************************************************************/ -template +template struct redux_traits { public: - typedef typename find_best_packet::type PacketType; + typedef typename find_best_packet::type PacketType; enum { PacketSize = unpacket_traits::size, - InnerMaxSize = int(Derived::IsRowMajor) - ? Derived::MaxColsAtCompileTime - : Derived::MaxRowsAtCompileTime + InnerMaxSize = int(Evaluator::IsRowMajor) + ? 
Evaluator::MaxColsAtCompileTime + : Evaluator::MaxRowsAtCompileTime, + OuterMaxSize = int(Evaluator::IsRowMajor) + ? Evaluator::MaxRowsAtCompileTime + : Evaluator::MaxColsAtCompileTime, + SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic + : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0) + : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize) }; enum { - MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) + MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit) && (functor_traits::PacketAccess), - MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), - MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize + MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit), + MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3) }; public: @@ -51,8 +57,8 @@ struct redux_traits public: enum { - Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost - : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, + Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost + : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 
1 : int(PacketSize)) }; @@ -64,18 +70,20 @@ struct redux_traits #ifdef EIGEN_DEBUG_ASSIGN static void debug() { - std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; + std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl; std::cerr.setf(std::ios::hex, std::ios::basefield); - EIGEN_DEBUG_VAR(Derived::Flags) + EIGEN_DEBUG_VAR(Evaluator::Flags) std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(InnerMaxSize) + EIGEN_DEBUG_VAR(OuterMaxSize) + EIGEN_DEBUG_VAR(SliceVectorizedWork) EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) - EIGEN_DEBUG_VAR(Traversal) + std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(UnrollingLimit) - EIGEN_DEBUG_VAR(Unrolling) + std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; std::cerr << std::endl; } #endif @@ -87,88 +95,86 @@ struct redux_traits /*** no vectorization ***/ -template +template struct redux_novec_unroller { enum { HalfLength = Length/2 }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func) { - return func(redux_novec_unroller::run(mat,func), - redux_novec_unroller::run(mat,func)); + return func(redux_novec_unroller::run(eval,func), + redux_novec_unroller::run(eval,func)); } }; -template -struct redux_novec_unroller +template +struct redux_novec_unroller { enum { - outer = Start / Derived::InnerSizeAtCompileTime, - inner = Start % Derived::InnerSizeAtCompileTime + outer = Start / Evaluator::InnerSizeAtCompileTime, + inner = Start % Evaluator::InnerSizeAtCompileTime }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; 
EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&) { - return mat.coeffByOuterInner(outer, inner); + return eval.coeffByOuterInner(outer, inner); } }; // This is actually dead code and will never be called. It is required // to prevent false warnings regarding failed inlining though // for 0 length run() will never be called at all. -template -struct redux_novec_unroller +template +struct redux_novec_unroller { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } + static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); } }; /*** vectorization ***/ -template +template struct redux_vec_unroller { - enum { - PacketSize = redux_traits::PacketSize, - HalfLength = Length/2 - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func) { + enum { + PacketSize = unpacket_traits::size, + HalfLength = Length/2 + }; + return func.packetOp( - redux_vec_unroller::run(mat,func), - redux_vec_unroller::run(mat,func) ); + redux_vec_unroller::template run(eval,func), + redux_vec_unroller::template run(eval,func) ); } }; -template -struct redux_vec_unroller +template +struct redux_vec_unroller { - enum { - index = Start * redux_traits::PacketSize, - outer = index / int(Derived::InnerSizeAtCompileTime), - inner = index % int(Derived::InnerSizeAtCompileTime), - alignment = Derived::Alignment - }; - - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static EIGEN_STRONG_INLINE PacketScalar run(const 
Derived &mat, const Func&) + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&) { - return mat.template packetByOuterInner(outer, inner); + enum { + PacketSize = unpacket_traits::size, + index = Start * PacketSize, + outer = index / int(Evaluator::InnerSizeAtCompileTime), + inner = index % int(Evaluator::InnerSizeAtCompileTime), + alignment = Evaluator::Alignment + }; + return eval.template packetByOuterInner(outer, inner); } }; @@ -176,53 +182,65 @@ struct redux_vec_unroller * Part 3 : implementation of all cases ***************************************************************************/ -template::Traversal, - int Unrolling = redux_traits::Unrolling +template::Traversal, + int Unrolling = redux_traits::Unrolling > struct redux_impl; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); Scalar res; - res = mat.coeffByOuterInner(0, 0); - for(Index i = 1; i < mat.innerSize(); ++i) - res = func(res, mat.coeffByOuterInner(0, i)); - for(Index i = 1; i < mat.outerSize(); ++i) - for(Index j = 0; j < mat.innerSize(); ++j) - res = func(res, mat.coeffByOuterInner(i, j)); + res = eval.coeffByOuterInner(0, 0); + for(Index i = 1; i < xpr.innerSize(); ++i) + res = func(res, eval.coeffByOuterInner(0, i)); + for(Index i = 1; i < xpr.outerSize(); ++i) + for(Index j = 0; j < xpr.innerSize(); ++j) + res = func(res, eval.coeffByOuterInner(i, j)); return res; } }; -template -struct redux_impl - : public redux_novec_unroller -{}; +template +struct 
redux_impl + : redux_novec_unroller +{ + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/) + { + return Base::run(eval,func); + } +}; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; - static Scalar run(const Derived &mat, const Func& func) + template + static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - const Index size = mat.size(); + const Index size = xpr.size(); - const Index packetSize = redux_traits::PacketSize; + const Index packetSize = redux_traits::PacketSize; const int packetAlignment = unpacket_traits::alignment; enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) + alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? 
int(packetAlignment) : int(Unaligned), + alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment) }; - const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); + const Index alignedStart = internal::first_default_aligned(xpr); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; @@ -230,34 +248,34 @@ struct redux_impl Scalar res; if(alignedSize) { - PacketScalar packet_res0 = mat.template packet(alignedStart); + PacketScalar packet_res0 = eval.template packet(alignedStart); if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop { - PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); + PacketScalar packet_res1 = eval.template packet(alignedStart+packetSize); for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) { - packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); - packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(index)); + packet_res1 = func.packetOp(packet_res1, eval.template packet(index+packetSize)); } packet_res0 = func.packetOp(packet_res0,packet_res1); if(alignedEnd>alignedEnd2) - packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(alignedEnd2)); } res = func.predux(packet_res0); for(Index index = 0; index < alignedStart; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); for(Index index = alignedEnd; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } else // too small to vectorize anything. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. 
{ - res = mat.coeff(0); + res = eval.coeff(0); for(Index index = 1; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } return res; @@ -265,130 +283,108 @@ struct redux_impl }; // NOTE: for SliceVectorizedTraversal we simply bypass unrolling -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketType; - EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - const Index innerSize = mat.innerSize(); - const Index outerSize = mat.outerSize(); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); + const Index innerSize = xpr.innerSize(); + const Index outerSize = xpr.outerSize(); enum { - packetSize = redux_traits::PacketSize + packetSize = redux_traits::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; if(packetedInnerSize) { - PacketType packet_res = mat.template packet(0,0); + PacketType packet_res = eval.template packet(0,0); for(Index j=0; j(j,i)); + packet_res = func.packetOp(packet_res, eval.template packetByOuterInner(j,i)); res = func.predux(packet_res); for(Index j=0; j::run(mat, func); + res = redux_impl::run(eval, func, xpr); } return res; } }; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename redux_traits::PacketType PacketType; enum { - PacketSize = redux_traits::PacketSize, - Size = Derived::SizeAtCompileTime, - VectorizedSize = (Size / PacketSize) * PacketSize 
+ PacketSize = redux_traits::PacketSize, + Size = Evaluator::SizeAtCompileTime, + VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize) }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + EIGEN_ONLY_USED_FOR_DEBUG(xpr) + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); if (VectorizedSize > 0) { - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + Scalar res = func.predux(redux_vec_unroller::template run(eval,func)); if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); + res = func(res,redux_novec_unroller::run(eval,func)); return res; } else { - return redux_novec_unroller::run(mat,func); + return redux_novec_unroller::run(eval,func); } } }; // evaluator adaptor template -class redux_evaluator +class redux_evaluator : public internal::evaluator<_XprType> { + typedef internal::evaluator<_XprType> Base; public: typedef _XprType XprType; - EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; enum { MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator - Flags = evaluator::Flags & ~DirectAccessBit, + Flags = Base::Flags & ~DirectAccessBit, IsRowMajor = XprType::IsRowMajor, 
SizeAtCompileTime = XprType::SizeAtCompileTime, - InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, - CoeffReadCost = evaluator::CoeffReadCost, - Alignment = evaluator::Alignment + InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime }; - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index row, Index col) const - { return m_evaluator.coeff(row, col); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index index) const - { return m_evaluator.coeff(index); } - - template - PacketType packet(Index row, Index col) const - { return m_evaluator.template packet(row, col); } - - template - PacketType packet(Index index) const - { return m_evaluator.template packet(index); } - - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const - { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const - { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } - const XprType & nestedExpression() const { return m_xpr; } - -protected: - internal::evaluator m_evaluator; - const XprType &m_xpr; }; } // end namespace internal @@ -403,39 +399,53 @@ class redux_evaluator * The template parameter \a BinaryOp is the type of the functor \a func which must be * an associative operator. 
Both current C++98 and C++11 functor styles are handled. * + * \warning the matrix must be not empty, otherwise an assertion is triggered. + * * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() */ template template -typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); typedef typename internal::redux_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - - return internal::redux_impl::run(thisEval, func); + + // The initial expression is passed to the reducer as an additional argument instead of + // passing it as a member of redux_evaluator to help + return internal::redux_impl::run(thisEval, func, derived()); } /** \returns the minimum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { - return derived().redux(Eigen::internal::scalar_min_op()); + return derived().redux(Eigen::internal::scalar_min_op()); } -/** \returns the maximum of all coefficients of \c *this. - * \warning the result is undefined if \c *this contains NaN. +/** \returns the maximum of all coefficients of \c *this. 
+ * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { - return derived().redux(Eigen::internal::scalar_max_op()); + return derived().redux(Eigen::internal::scalar_max_op()); } /** \returns the sum of all coefficients of \c *this @@ -445,7 +455,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +468,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +489,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +504,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); diff --git a/externals/eigen/Eigen/src/Core/Ref.h b/externals/eigen/Eigen/src/Core/Ref.h index bdf24f52..c2a37ead 100644 --- 
a/externals/eigen/Eigen/src/Core/Ref.h +++ b/externals/eigen/Eigen/src/Core/Ref.h @@ -10,7 +10,7 @@ #ifndef EIGEN_REF_H #define EIGEN_REF_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -28,12 +28,13 @@ struct traits > template struct match { enum { + IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime, HasDirectAccess = internal::has_direct_access::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), - OuterStrideMatch = Derived::IsVectorAtCompileTime + OuterStrideMatch = IsVectorAtCompileTime || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), // NOTE, this indirection of evaluator::Alignment is needed // to workaround a very strange bug in MSVC related to the instantiation @@ -47,7 +48,7 @@ struct traits > }; typedef typename internal::conditional::type type; }; - + }; template @@ -66,12 +67,12 @@ template class RefBase typedef MapBase Base; EIGEN_DENSE_PUBLIC_INTERFACE(RefBase) - EIGEN_DEVICE_FUNC inline Index innerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index innerStride() const { return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; } - EIGEN_DEVICE_FUNC inline Index outerStride() const + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() : IsVectorAtCompileTime ? 
this->size() @@ -85,34 +86,122 @@ template class RefBase m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime) {} - + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase) protected: typedef Stride StrideBase; + // Resolves inner stride if default 0. + static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveInnerStride(Index inner) { + return inner == 0 ? 1 : inner; + } + + // Resolves outer stride if default 0. + static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols, bool isVectorAtCompileTime, bool isRowMajor) { + return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer; + } + + // Returns true if construction is valid, false if there is a stride mismatch, + // and fails if there is a size mismatch. template - EIGEN_DEVICE_FUNC void construct(Expression& expr) + EIGEN_DEVICE_FUNC bool construct(Expression& expr) { + // Check matrix sizes. If this is a compile-time vector, we do allow + // implicitly transposing. + EIGEN_STATIC_ASSERT( + EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression) + // If it is a vector, the transpose sizes might match. + || ( PlainObjectType::IsVectorAtCompileTime + && ((int(PlainObjectType::RowsAtCompileTime)==Eigen::Dynamic + || int(Expression::ColsAtCompileTime)==Eigen::Dynamic + || int(PlainObjectType::RowsAtCompileTime)==int(Expression::ColsAtCompileTime)) + && (int(PlainObjectType::ColsAtCompileTime)==Eigen::Dynamic + || int(Expression::RowsAtCompileTime)==Eigen::Dynamic + || int(PlainObjectType::ColsAtCompileTime)==int(Expression::RowsAtCompileTime)))), + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES + ) + + // Determine runtime rows and columns. 
+ Index rows = expr.rows(); + Index cols = expr.cols(); if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); - ::new (static_cast(this)) Base(expr.data(), 1, expr.size()); + rows = 1; + cols = expr.size(); } else if(PlainObjectType::ColsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); - ::new (static_cast(this)) Base(expr.data(), expr.size(), 1); + rows = expr.size(); + cols = 1; + } + // Verify that the sizes are valid. + eigen_assert( + (PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows)); + eigen_assert( + (PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols)); + + + // If this is a vector, we might be transposing, which means that stride should swap. + const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows()); + // If the storage format differs, we also need to swap the stride. + const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0; + const bool expr_row_major = (Expression::Flags&RowMajorBit) != 0; + const bool storage_differs = (row_major != expr_row_major); + + const bool swap_stride = (transpose != storage_differs); + + // Determine expr's actual strides, resolving any defaults if zero. + const Index expr_inner_actual = resolveInnerStride(expr.innerStride()); + const Index expr_outer_actual = resolveOuterStride(expr_inner_actual, + expr.outerStride(), + expr.rows(), + expr.cols(), + Expression::IsVectorAtCompileTime != 0, + expr_row_major); + + // If this is a column-major row vector or row-major column vector, the inner-stride + // is arbitrary, so set it to either the compile-time inner stride or 1. + const bool row_vector = (rows == 1); + const bool col_vector = (cols == 1); + const Index inner_stride = + ( (!row_major && row_vector) || (row_major && col_vector) ) ? + ( StrideType::InnerStrideAtCompileTime > 0 ? 
Index(StrideType::InnerStrideAtCompileTime) : 1) + : swap_stride ? expr_outer_actual : expr_inner_actual; + + // If this is a column-major column vector or row-major row vector, the outer-stride + // is arbitrary, so set it to either the compile-time outer stride or vector size. + const Index outer_stride = + ( (!row_major && col_vector) || (row_major && row_vector) ) ? + ( StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime) : rows * cols * inner_stride) + : swap_stride ? expr_inner_actual : expr_outer_actual; + + // Check if given inner/outer strides are compatible with compile-time strides. + const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic) + || (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride); + if (!inner_valid) { + return false; } - else - ::new (static_cast(this)) Base(expr.data(), expr.rows(), expr.cols()); - - if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit))) - ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1); - else - ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(), - StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride()); + + const bool outer_valid = (StrideType::OuterStrideAtCompileTime == Dynamic) + || (resolveOuterStride( + inner_stride, + Index(StrideType::OuterStrideAtCompileTime), + rows, cols, PlainObjectType::IsVectorAtCompileTime != 0, + row_major) + == outer_stride); + if (!outer_valid) { + return false; + } + + ::new (static_cast(this)) Base(expr.data(), rows, cols); + ::new (&m_stride) StrideBase( + (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride, + (StrideType::InnerStrideAtCompileTime == 0) ? 
0 : inner_stride ); + return true; } StrideBase m_stride; @@ -184,6 +273,8 @@ template class RefBase * void foo(const Ref >& A) { foo_impl(A); } * \endcode * + * See also the following stackoverflow questions for further references: + * - Correct usage of the Eigen::Ref<> class * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ @@ -207,7 +298,10 @@ template class Ref typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); - Base::construct(expr.derived()); + // Construction must pass since we will not create temprary storage in the non-const case. + const bool success = Base::construct(expr.derived()); + EIGEN_UNUSED_VARIABLE(success) + eigen_assert(success); } template EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, @@ -221,7 +315,10 @@ template class Ref EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); - Base::construct(expr.const_cast_derived()); + // Construction must pass since we will not create temporary storage in the non-const case. + const bool success = Base::construct(expr.const_cast_derived()); + EIGEN_UNUSED_VARIABLE(success) + eigen_assert(success); } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref) @@ -262,7 +359,10 @@ template class Ref< template EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type) { - Base::construct(expr); + // Check if we can use the underlying expr's storage directly, otherwise call the copy version. 
+ if (!Base::construct(expr)) { + construct(expr, internal::false_type()); + } } template diff --git a/externals/eigen/Eigen/src/Core/Replicate.h b/externals/eigen/Eigen/src/Core/Replicate.h index 9960ef88..ab5be7e6 100644 --- a/externals/eigen/Eigen/src/Core/Replicate.h +++ b/externals/eigen/Eigen/src/Core/Replicate.h @@ -10,7 +10,7 @@ #ifndef EIGEN_REPLICATE_H #define EIGEN_REPLICATE_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -35,7 +35,7 @@ struct traits > IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1 : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0 : (MatrixType::Flags & RowMajorBit) ? 1 : 0, - + // FIXME enable DirectAccess with negative strides? Flags = IsRowMajor ? RowMajorBit : 0 }; @@ -88,15 +88,15 @@ template class Replicate THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); } EIGEN_DEVICE_FUNC const _MatrixTypeNested& nestedExpression() const - { - return m_matrix; + { + return m_matrix; } protected: @@ -115,7 +115,7 @@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/externals/eigen/Eigen/src/Core/Reshaped.h b/externals/eigen/Eigen/src/Core/Reshaped.h new file mode 100644 index 00000000..52de73b6 --- /dev/null +++ 
b/externals/eigen/Eigen/src/Core/Reshaped.h @@ -0,0 +1,454 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2017 Gael Guennebaud +// Copyright (C) 2014 yoco +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_RESHAPED_H +#define EIGEN_RESHAPED_H + +namespace Eigen { + +/** \class Reshaped + * \ingroup Core_Module + * + * \brief Expression of a fixed-size or dynamic-size reshape + * + * \tparam XprType the type of the expression in which we are taking a reshape + * \tparam Rows the number of rows of the reshape we are taking at compile time (optional) + * \tparam Cols the number of columns of the reshape we are taking at compile time (optional) + * \tparam Order can be ColMajor or RowMajor, default is ColMajor. + * + * This class represents an expression of either a fixed-size or dynamic-size reshape. + * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and + * most of the time this is the only way it is used. + * + * However, in C++98, if you want to directly maniputate reshaped expressions, + * for instance if you want to write a function returning such an expression, you + * will need to use this class. In C++11, it is advised to use the \em auto + * keyword for such use cases. 
+ * + * Here is an example illustrating the dynamic case: + * \include class_Reshaped.cpp + * Output: \verbinclude class_Reshaped.out + * + * Here is an example illustrating the fixed-size case: + * \include class_FixedReshaped.cpp + * Output: \verbinclude class_FixedReshaped.out + * + * \sa DenseBase::reshaped(NRowsType,NColsType) + */ + +namespace internal { + +template +struct traits > : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::XprKind XprKind; + enum{ + MatrixRows = traits::RowsAtCompileTime, + MatrixCols = traits::ColsAtCompileTime, + RowsAtCompileTime = Rows, + ColsAtCompileTime = Cols, + MaxRowsAtCompileTime = Rows, + MaxColsAtCompileTime = Cols, + XpxStorageOrder = ((int(traits::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor, + ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor + : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor + : XpxStorageOrder, + HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder), + InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime), + InnerStrideAtCompileTime = HasSameStorageOrderAsXprType + ? int(inner_stride_at_compile_time::ret) + : Dynamic, + OuterStrideAtCompileTime = Dynamic, + + HasDirectAccess = internal::has_direct_access::ret + && (Order==int(XpxStorageOrder)) + && ((evaluator::Flags&LinearAccessBit)==LinearAccessBit), + + MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits::size) == 0) + && (InnerStrideAtCompileTime == 1) + ? PacketAccessBit : 0, + //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, + FlagsLvalueBit = is_lvalue::value ? 
LvalueBit : 0, + FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, + FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0, + Flags0 = traits::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit), + + Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit) + }; +}; + +template class ReshapedImpl_dense; + +} // end namespace internal + +template class ReshapedImpl; + +template class Reshaped + : public ReshapedImpl::StorageKind> +{ + typedef ReshapedImpl::StorageKind> Impl; + public: + //typedef typename Impl::Base Base; + typedef Impl Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped) + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline Reshaped(XprType& xpr) + : Impl(xpr) + { + EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) + eigen_assert(Rows * Cols == xpr.rows() * xpr.cols()); + } + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline Reshaped(XprType& xpr, + Index reshapeRows, Index reshapeCols) + : Impl(xpr, reshapeRows, reshapeCols) + { + eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows) + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols)); + eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols()); + } +}; + +// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense +// that must be specialized for direct and non-direct access... 
+template +class ReshapedImpl + : public internal::ReshapedImpl_dense >::HasDirectAccess> +{ + typedef internal::ReshapedImpl_dense >::HasDirectAccess> Impl; + public: + typedef Impl Base; + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl) + EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {} + EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols) + : Impl(xpr, reshapeRows, reshapeCols) {} +}; + +namespace internal { + +/** \internal Internal implementation of dense Reshaped in the general case. */ +template +class ReshapedImpl_dense + : public internal::dense_xpr_base >::type +{ + typedef Reshaped ReshapedType; + public: + + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) + + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename internal::remove_all::type NestedExpression; + + class InnerIterator; + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr) + : m_xpr(xpr), m_rows(Rows), m_cols(Cols) + {} + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) + : m_xpr(xpr), m_rows(nRows), m_cols(nCols) + {} + + EIGEN_DEVICE_FUNC Index rows() const { return m_rows; } + EIGEN_DEVICE_FUNC Index cols() const { return m_cols; } + + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** \sa MapBase::data() */ + EIGEN_DEVICE_FUNC inline const Scalar* data() const; + EIGEN_DEVICE_FUNC inline Index innerStride() const; + EIGEN_DEVICE_FUNC inline Index outerStride() const; + #endif + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + typename internal::remove_reference::type& + nestedExpression() { return m_xpr; } + + protected: + 
+ MatrixTypeNested m_xpr; + const internal::variable_if_dynamic m_rows; + const internal::variable_if_dynamic m_cols; +}; + + +/** \internal Internal implementation of dense Reshaped in the direct access case. */ +template +class ReshapedImpl_dense + : public MapBase > +{ + typedef Reshaped ReshapedType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; + public: + + typedef MapBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) + + /** Fixed-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr) + : Base(xpr.data()), m_xpr(xpr) + {} + + /** Dynamic-size constructor + */ + EIGEN_DEVICE_FUNC + inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) + : Base(xpr.data(), nRows, nCols), + m_xpr(xpr) + {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& nestedExpression() const + { + return m_xpr; + } + + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } + + /** \sa MapBase::innerStride() */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const + { + return m_xpr.innerStride(); + } + + /** \sa MapBase::outerStride() */ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const + { + return ((Flags&RowMajorBit)==RowMajorBit) ? 
this->cols() : this->rows(); + } + + protected: + + XprTypeNested m_xpr; +}; + +// Evaluators +template struct reshaped_evaluator; + +template +struct evaluator > + : reshaped_evaluator >::HasDirectAccess> +{ + typedef Reshaped XprType; + typedef typename XprType::Scalar Scalar; + // TODO: should check for smaller packet types + typedef typename packet_traits::type PacketScalar; + + enum { + CoeffReadCost = evaluator::CoeffReadCost, + HasDirectAccess = traits::HasDirectAccess, + +// RowsAtCompileTime = traits::RowsAtCompileTime, +// ColsAtCompileTime = traits::ColsAtCompileTime, +// MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, +// MaxColsAtCompileTime = traits::MaxColsAtCompileTime, +// +// InnerStrideAtCompileTime = traits::HasSameStorageOrderAsXprType +// ? int(inner_stride_at_compile_time::ret) +// : Dynamic, +// OuterStrideAtCompileTime = Dynamic, + + FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0, + FlagsRowMajorBit = (traits::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, + FlagsDirectAccessBit = HasDirectAccess ? 
DirectAccessBit : 0, + Flags0 = evaluator::Flags & (HereditaryBits & ~RowMajorBit), + Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit, + + PacketAlignment = unpacket_traits::alignment, + Alignment = evaluator::Alignment + }; + typedef reshaped_evaluator reshaped_evaluator_type; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } +}; + +template +struct reshaped_evaluator + : evaluator_base > +{ + typedef Reshaped XprType; + + enum { + CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of index computations */, + + Flags = (evaluator::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)), + + Alignment = 0 + }; + + EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef std::pair RowCol; + + inline RowCol index_remap(Index rowId, Index colId) const + { + if(Order==ColMajor) + { + const Index nth_elem_idx = colId * m_xpr.rows() + rowId; + return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(), + nth_elem_idx / m_xpr.nestedExpression().rows()); + } + else + { + const Index nth_elem_idx = colId + rowId * m_xpr.cols(); + return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(), + nth_elem_idx % m_xpr.nestedExpression().cols()); + } + } + + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index rowId, Index colId) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + 
EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.coeff(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeffRef(row_col.first, row_col.second); + + } + + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeffRef(row_col.first, row_col.second); + } + + EIGEN_DEVICE_FUNC + inline const CoeffReturnType coeff(Index index) const + { + const RowCol row_col = index_remap(Rows == 1 ? 0 : index, + Rows == 1 ? index : 0); + return m_argImpl.coeff(row_col.first, row_col.second); + } +#if 0 + EIGEN_DEVICE_FUNC + template + inline PacketScalar packet(Index rowId, Index colId) const + { + const RowCol row_col = index_remap(rowId, colId); + return m_argImpl.template packet(row_col.first, row_col.second); + + } + + template + EIGEN_DEVICE_FUNC + inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + { + const RowCol row_col = index_remap(rowId, colId); + m_argImpl.const_cast_derived().template writePacket + (row_col.first, row_col.second, val); + } + + template + EIGEN_DEVICE_FUNC + inline PacketScalar packet(Index index) const + { + const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); + return m_argImpl.template packet(row_col.first, row_col.second); + } + + template + EIGEN_DEVICE_FUNC + inline void writePacket(Index index, const PacketScalar& val) + { + const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? 
index : 0); + return m_argImpl.template packet(row_col.first, row_col.second, val); + } +#endif +protected: + + evaluator m_argImpl; + const XprType& m_xpr; + +}; + +template +struct reshaped_evaluator +: mapbase_evaluator, + typename Reshaped::PlainObject> +{ + typedef Reshaped XprType; + typedef typename XprType::Scalar Scalar; + + EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) + : mapbase_evaluator(xpr) + { + // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime + eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_RESHAPED_H diff --git a/externals/eigen/Eigen/src/Core/ReturnByValue.h b/externals/eigen/Eigen/src/Core/ReturnByValue.h index c44b7673..4dad13ea 100644 --- a/externals/eigen/Eigen/src/Core/ReturnByValue.h +++ b/externals/eigen/Eigen/src/Core/ReturnByValue.h @@ -60,8 +60,10 @@ template class ReturnByValue EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { static_cast(this)->evalTo(dst); } - EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast(this)->rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast(this)->cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return static_cast(this)->rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return static_cast(this)->cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT @@ -79,7 +81,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); @@ 
-90,7 +92,7 @@ namespace internal { // Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that // when a ReturnByValue expression is assigned, the evaluator is not constructed. // TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world - + template struct evaluator > : public evaluator::ReturnType> @@ -98,7 +100,7 @@ struct evaluator > typedef ReturnByValue XprType; typedef typename internal::traits::ReturnType PlainObject; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) { diff --git a/externals/eigen/Eigen/src/Core/Reverse.h b/externals/eigen/Eigen/src/Core/Reverse.h index 0640cda2..28cdd76a 100644 --- a/externals/eigen/Eigen/src/Core/Reverse.h +++ b/externals/eigen/Eigen/src/Core/Reverse.h @@ -12,7 +12,7 @@ #ifndef EIGEN_REVERSE_H #define EIGEN_REVERSE_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -44,7 +44,7 @@ template struct reverse_packet_cond static inline PacketType run(const PacketType& x) { return x; } }; -} // end namespace internal +} // end namespace internal /** \class Reverse * \ingroup Core_Module @@ -89,8 +89,10 @@ template class Reverse EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } EIGEN_DEVICE_FUNC inline Index innerStride() const { @@ -98,7 +100,7 @@ template class Reverse } EIGEN_DEVICE_FUNC const typename internal::remove_all::type& - nestedExpression() const + nestedExpression() const { return m_matrix; } @@ -114,7 +116,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC 
inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +138,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -161,7 +163,7 @@ inline void DenseBase::reverseInPlace() } namespace internal { - + template struct vectorwise_reverse_inplace_impl; @@ -171,8 +173,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; Index half = xpr.rows()/2; - xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); + xpr.topRows(fix(half)) + .swap(xpr.bottomRows(fix(half)).colwise().reverse()); } }; @@ -182,8 +186,10 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { + const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; Index half = xpr.cols()/2; - xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); + xpr.leftCols(fix(half)) + .swap(xpr.rightCols(fix(half)).rowwise().reverse()); } }; @@ -201,9 +207,9 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { - internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); + internal::vectorwise_reverse_inplace_impl::run(m_matrix); } } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/Select.h b/externals/eigen/Eigen/src/Core/Select.h index 79eec1b5..7c86bf87 100644 --- a/externals/eigen/Eigen/src/Core/Select.h +++ b/externals/eigen/Eigen/src/Core/Select.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SELECT_H #define EIGEN_SELECT_H -namespace Eigen { +namespace Eigen { /** 
\class Select * \ingroup Core_Module @@ -67,8 +67,10 @@ class Select : public internal::dense_xpr_base< Select template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const DenseBase& thenMatrix, const DenseBase& elseMatrix) const { @@ -134,7 +136,7 @@ DenseBase::select(const DenseBase& thenMatrix, */ template template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const { @@ -149,7 +151,7 @@ DenseBase::select(const DenseBase& thenMatrix, */ template template -inline const Select +inline EIGEN_DEVICE_FUNC const Select DenseBase::select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const { diff --git a/externals/eigen/Eigen/src/Core/SelfAdjointView.h b/externals/eigen/Eigen/src/Core/SelfAdjointView.h index 504c98f0..8ce3b372 100644 --- a/externals/eigen/Eigen/src/Core/SelfAdjointView.h +++ b/externals/eigen/Eigen/src/Core/SelfAdjointView.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SELFADJOINTMATRIX_H #define EIGEN_SELFADJOINTMATRIX_H -namespace Eigen { +namespace Eigen { /** \class SelfAdjointView * \ingroup Core_Module @@ -58,29 +58,32 @@ template class SelfAdjointView typedef MatrixTypeNestedCleaned NestedExpression; /** \brief The type of coefficients in this matrix */ - typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::remove_all::type MatrixConjugateReturnType; + typedef SelfAdjointView::type, UpLo> ConstSelfAdjointView; enum { Mode = internal::traits::Mode, Flags = internal::traits::Flags, - TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0) + TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? 
Upper : 0) }; typedef typename MatrixType::PlainObject PlainObject; EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) - {} + { + EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); + } - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_matrix.rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return m_matrix.outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return m_matrix.innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return m_matrix.outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return m_matrix.innerStride(); } /** \sa MatrixBase::coeff() * \warning the coordinates must fit into the referenced triangular part @@ -129,7 +132,7 @@ template class SelfAdjointView { return Product(lhs.derived(),rhs); } - + friend EIGEN_DEVICE_FUNC const SelfAdjointView operator*(const Scalar& s, const SelfAdjointView& mat) @@ -189,12 +192,24 @@ template class SelfAdjointView TriangularView >::type(tmp2); } - typedef SelfAdjointView ConjugateReturnType; + typedef SelfAdjointView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. 
+ */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef SelfAdjointView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -285,17 +300,17 @@ class triangular_dense_assignment_kernel template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -339,7 +354,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/externals/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h b/externals/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h index 719ed72a..7c89c2e2 100644 --- a/externals/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/externals/eigen/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -15,33 +15,29 @@ namespace Eigen { // TODO generalize the scalar type of 'other' template -EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), 
internal::add_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); return derived(); } diff --git a/externals/eigen/Eigen/src/Core/Solve.h b/externals/eigen/Eigen/src/Core/Solve.h index 960a5859..23d5cb70 100644 --- a/externals/eigen/Eigen/src/Core/Solve.h +++ b/externals/eigen/Eigen/src/Core/Solve.h @@ -13,13 +13,13 @@ namespace Eigen { template class SolveImpl; - + /** \class Solve * \ingroup Core_Module * * \brief Pseudo expression representing a solving operation * - * \tparam Decomposition the type of the matrix or decomposion object + * \tparam Decomposition the type of the matrix or decomposition object * \tparam Rhstype the type of the right-hand side * * This class represents an expression of A.solve(B) @@ -34,12 +34,12 @@ template struct s template struct solve_traits { - typedef Matrix PlainObject; + RhsType::MaxColsAtCompileTime>::type PlainObject; }; template @@ -64,13 +64,13 @@ class Solve : public SolveImpl::PlainObject PlainObject; typedef typename internal::traits::StorageIndex StorageIndex; - + Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {} - - EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { 
return m_dec.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; } @@ -87,14 +87,14 @@ class SolveImpl : public MatrixBase > { typedef Solve Derived; - + public: - + typedef MatrixBase > Base; EIGEN_DENSE_PUBLIC_INTERFACE(Derived) private: - + Scalar coeff(Index row, Index col) const; Scalar coeff(Index i) const; }; @@ -119,15 +119,15 @@ struct evaluator > typedef evaluator Base; enum { Flags = Base::Flags | EvalBeforeNestingBit }; - + EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve) : m_result(solve.rows(), solve.cols()) { ::new (static_cast(this)) Base(m_result); solve.dec()._solve_impl(solve.rhs(), m_result); } - -protected: + +protected: PlainObject m_result; }; @@ -176,12 +176,12 @@ struct Assignment(src.rhs(), dst); } }; -} // end namepsace internal +} // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/SolveTriangular.h b/externals/eigen/Eigen/src/Core/SolveTriangular.h index 049890b2..dfbf9952 100644 --- a/externals/eigen/Eigen/src/Core/SolveTriangular.h +++ b/externals/eigen/Eigen/src/Core/SolveTriangular.h @@ -10,7 +10,7 @@ #ifndef EIGEN_SOLVETRIANGULAR_H #define EIGEN_SOLVETRIANGULAR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -19,7 +19,7 @@ namespace internal { template struct triangular_solve_vector; -template +template struct triangular_solve_matrix; // small helper struct extracting some traits on the underlying solver operation @@ -54,7 +54,7 @@ struct triangular_solver_selector typedef blas_traits LhsProductTraits; typedef typename LhsProductTraits::ExtractType ActualLhsType; typedef Map, Aligned> MappedRhs; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { ActualLhsType actualLhs = LhsProductTraits::extract(lhs); @@ -64,7 +64,7 @@ struct 
triangular_solver_selector ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(), (useRhsDirectly ? rhs.data() : 0)); - + if(!useRhsDirectly) MappedRhs(actualRhs,rhs.size()) = rhs; @@ -85,7 +85,7 @@ struct triangular_solver_selector typedef blas_traits LhsProductTraits; typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { typename internal::add_const_on_value_type::type actualLhs = LhsProductTraits::extract(lhs); @@ -98,8 +98,8 @@ struct triangular_solver_selector BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix - ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking); + (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime> + ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking); } }; @@ -118,7 +118,7 @@ struct triangular_solver_unroller { DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1, StartIndex = IsLower ? 
0 : DiagIndex+1 }; - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { if (LoopIndex>0) rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment(StartIndex).transpose() @@ -133,22 +133,22 @@ struct triangular_solver_unroller { template struct triangular_solver_unroller { - static void run(const Lhs&, Rhs&) {} + static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {} }; template struct triangular_solver_selector { - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { triangular_solver_unroller::run(lhs,rhs); } }; template struct triangular_solver_selector { - static void run(const Lhs& lhs, Rhs& rhs) + static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) { Transpose trLhs(lhs); Transpose trRhs(rhs); - + triangular_solver_unroller,Transpose, ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag), 0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs); @@ -164,11 +164,14 @@ struct triangular_solver_selector { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); - eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower))); + eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower)))); + // If solving for a 0x0 matrix, nothing to do, simply return. 
+ if (derived().cols() == 0) + return; enum { copy = (internal::traits::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1}; typedef typename internal::conditional struct triangular_solv : m_triangularMatrix(tri), m_rhs(rhs) {} - inline Index rows() const { return m_rhs.rows(); } - inline Index cols() const { return m_rhs.cols(); } + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_rhs.rows(); } + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template inline void evalTo(Dest& dst) const { diff --git a/externals/eigen/Eigen/src/Core/SolverBase.h b/externals/eigen/Eigen/src/Core/SolverBase.h index 8a4adc22..50146104 100644 --- a/externals/eigen/Eigen/src/Core/SolverBase.h +++ b/externals/eigen/Eigen/src/Core/SolverBase.h @@ -14,8 +14,35 @@ namespace Eigen { namespace internal { +template +struct solve_assertion { + template + static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion(b); } +}; + +template +struct solve_assertion > +{ + typedef Transpose type; + + template + static void run(const type& transpose, const Rhs& b) + { + internal::solve_assertion::type>::template run(transpose.nestedExpression(), b); + } +}; +template +struct solve_assertion, const Transpose > > +{ + typedef CwiseUnaryOp, const Transpose > type; + template + static void run(const type& adjoint, const Rhs& b) + { + internal::solve_assertion >::type>::template run(adjoint.nestedExpression(), b); + } +}; } // end namespace internal /** \class SolverBase @@ -35,7 +62,7 @@ namespace internal { * * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors. 
* - * \sa class PartialPivLU, class FullPivLU + * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase */ template class SolverBase : public EigenBase @@ -46,6 +73,9 @@ class SolverBase : public EigenBase typedef typename internal::traits::Scalar Scalar; typedef Scalar CoeffReturnType; + template + friend struct internal::solve_assertion; + enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, @@ -56,7 +86,8 @@ class SolverBase : public EigenBase MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret), IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1 + || internal::traits::MaxColsAtCompileTime == 1, + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 
1 : 2 }; /** Default constructor */ @@ -74,7 +105,7 @@ class SolverBase : public EigenBase inline const Solve solve(const MatrixBase& b) const { - eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b"); + internal::solve_assertion::type>::template run(derived(), b); return Solve(derived(), b.derived()); } @@ -112,6 +143,13 @@ class SolverBase : public EigenBase } protected: + + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(derived().m_isInitialized && "Solver is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); + } }; namespace internal { diff --git a/externals/eigen/Eigen/src/Core/StableNorm.h b/externals/eigen/Eigen/src/Core/StableNorm.h index d2fe1e19..4a3f0cca 100644 --- a/externals/eigen/Eigen/src/Core/StableNorm.h +++ b/externals/eigen/Eigen/src/Core/StableNorm.h @@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc ssq += (bl*invScale).squaredNorm(); } +template +void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) +{ + typedef typename VectorType::Scalar Scalar; + const Index blockSize = 4096; + + typedef typename internal::nested_eval::type VectorTypeCopy; + typedef typename internal::remove_all::type VectorTypeCopyClean; + const VectorTypeCopy copy(vec); + + enum { + CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization + }; + typedef typename internal::conditional, internal::evaluator::Alignment>, + typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + Index n = vec.size(); 
+ + Index bi = internal::first_default_aligned(copy); + if (bi>0) + internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); + for (; bi +typename VectorType::RealScalar +stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) +{ + using std::sqrt; + using std::abs; + + Index n = vec.size(); + + if(n==1) + return abs(vec.coeff(0)); + + typedef typename VectorType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + stable_norm_impl_inner_step(vec, ssq, scale, invScale); + + return scale * sqrt(ssq); +} + +template +typename MatrixType::RealScalar +stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) +{ + using std::sqrt; + + typedef typename MatrixType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + for(Index j=0; j inline typename NumTraits::Scalar>::Real blueNorm_impl(const EigenBase& _vec) @@ -58,52 +123,43 @@ blueNorm_impl(const EigenBase& _vec) using std::pow; using std::sqrt; using std::abs; + + // This program calculates the machine-dependent constants + // bl, b2, slm, s2m, relerr overfl + // from the "basic" machine-dependent numbers + // nbig, ibeta, it, iemin, iemax, rbig. + // The following define the basic machine-dependent constants. + // For portability, the PORT subprograms "ilmaeh" and "rlmach" + // are used. 
For any specific computer, each of the assignment + // statements can be replaced + static const int ibeta = std::numeric_limits::radix; // base for floating-point numbers + static const int it = NumTraits::digits(); // number of base-beta digits in mantissa + static const int iemin = NumTraits::min_exponent(); // minimum exponent + static const int iemax = NumTraits::max_exponent(); // maximum exponent + static const RealScalar rbig = NumTraits::highest(); // largest floating-point number + static const RealScalar b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(-((1-iemin)/2)))); // lower boundary of midrange + static const RealScalar b2 = RealScalar(pow(RealScalar(ibeta),RealScalar((iemax + 1 - it)/2))); // upper boundary of midrange + static const RealScalar s1m = RealScalar(pow(RealScalar(ibeta),RealScalar((2-iemin)/2))); // scaling factor for lower range + static const RealScalar s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(- ((iemax+it)/2)))); // scaling factor for upper range + static const RealScalar eps = RealScalar(pow(double(ibeta), 1-it)); + static const RealScalar relerr = sqrt(eps); // tolerance for neglecting asml + const Derived& vec(_vec.derived()); - static bool initialized = false; - static RealScalar b1, b2, s1m, s2m, rbig, relerr; - if(!initialized) - { - int ibeta, it, iemin, iemax, iexp; - RealScalar eps; - // This program calculates the machine-dependent constants - // bl, b2, slm, s2m, relerr overfl - // from the "basic" machine-dependent numbers - // nbig, ibeta, it, iemin, iemax, rbig. - // The following define the basic machine-dependent constants. - // For portability, the PORT subprograms "ilmaeh" and "rlmach" - // are used. 
For any specific computer, each of the assignment - // statements can be replaced - ibeta = std::numeric_limits::radix; // base for floating-point numbers - it = std::numeric_limits::digits; // number of base-beta digits in mantissa - iemin = std::numeric_limits::min_exponent; // minimum exponent - iemax = std::numeric_limits::max_exponent; // maximum exponent - rbig = (std::numeric_limits::max)(); // largest floating-point number - - iexp = -((1-iemin)/2); - b1 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // lower boundary of midrange - iexp = (iemax + 1 - it)/2; - b2 = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // upper boundary of midrange - - iexp = (2-iemin)/2; - s1m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for lower range - iexp = - ((iemax+it)/2); - s2m = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp))); // scaling factor for upper range - - eps = RealScalar(pow(double(ibeta), 1-it)); - relerr = sqrt(eps); // tolerance for neglecting asml - initialized = true; - } Index n = vec.size(); RealScalar ab2 = b2 / RealScalar(n); RealScalar asml = RealScalar(0); RealScalar amed = RealScalar(0); RealScalar abig = RealScalar(0); - for(typename Derived::InnerIterator it(vec, 0); it; ++it) + + for(Index j=0; j ab2) abig += numext::abs2(ax*s2m); - else if(ax < b1) asml += numext::abs2(ax*s1m); - else amed += numext::abs2(ax); + for(typename Derived::InnerIterator iter(vec, j); iter; ++iter) + { + RealScalar ax = abs(iter.value()); + if(ax > ab2) abig += numext::abs2(ax*s2m); + else if(ax < b1) asml += numext::abs2(ax*s1m); + else amed += numext::abs2(ax); + } } if(amed!=amed) return amed; // we got a NaN @@ -156,35 +212,7 @@ template inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { - using std::sqrt; - using std::abs; - const Index blockSize = 4096; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of square - - typedef typename internal::nested_eval::type 
DerivedCopy; - typedef typename internal::remove_all::type DerivedCopyClean; - DerivedCopy copy(derived()); - - enum { - CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) - || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*2, internal::evaluator::Alignment>, - typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; - Index n = size(); - - if(n==1) - return abs(this->coeff(0)); - - Index bi = internal::first_default_aligned(copy); - if (bi>0) - internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); - for (; bi inline typename NumTraits::Scalar>::Real MatrixBase::hypotNorm() const { - return this->cwiseAbs().redux(internal::scalar_hypot_op()); + if(size()==1) + return numext::abs(coeff(0,0)); + else + return this->cwiseAbs().redux(internal::scalar_hypot_op()); } } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/StlIterators.h b/externals/eigen/Eigen/src/Core/StlIterators.h new file mode 100644 index 00000000..09041db1 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/StlIterators.h @@ -0,0 +1,463 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_STLITERATORS_H +#define EIGEN_STLITERATORS_H + +namespace Eigen { + +namespace internal { + +template +struct indexed_based_stl_iterator_traits; + +template +class indexed_based_stl_iterator_base +{ +protected: + typedef indexed_based_stl_iterator_traits traits; + typedef typename traits::XprType XprType; + typedef indexed_based_stl_iterator_base non_const_iterator; + typedef indexed_based_stl_iterator_base const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class indexed_based_stl_iterator_base; + friend class indexed_based_stl_iterator_base; +public: + typedef Index difference_type; + typedef std::random_access_iterator_tag iterator_category; + + indexed_based_stl_iterator_base() EIGEN_NO_THROW : mp_xpr(0), m_index(0) {} + indexed_based_stl_iterator_base(XprType& xpr, Index index) EIGEN_NO_THROW : mp_xpr(&xpr), m_index(index) {} + + indexed_based_stl_iterator_base(const non_const_iterator& other) EIGEN_NO_THROW + : mp_xpr(other.mp_xpr), m_index(other.m_index) + {} + + indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) + { + mp_xpr = other.mp_xpr; + m_index = other.m_index; + return *this; + } + + Derived& operator++() { ++m_index; return derived(); } + Derived& operator--() { --m_index; return derived(); } + + Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} + Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} + + friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; } + friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } + friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; 
return ret; } + friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } + + Derived& operator+=(Index b) { m_index += b; return derived(); } + Derived& operator-=(Index b) { m_index -= b; return derived(); } + + difference_type operator-(const indexed_based_stl_iterator_base& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return m_index - other.m_index; + } + + difference_type operator-(const other_iterator& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return m_index - other.m_index; + } + + bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + + bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + bool operator> (const 
other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + +protected: + + Derived& derived() { return static_cast(*this); } + const Derived& derived() const { return static_cast(*this); } + + XprType *mp_xpr; + Index m_index; +}; + +template +class indexed_based_stl_reverse_iterator_base +{ +protected: + typedef indexed_based_stl_iterator_traits traits; + typedef typename traits::XprType XprType; + typedef indexed_based_stl_reverse_iterator_base non_const_iterator; + typedef indexed_based_stl_reverse_iterator_base const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class indexed_based_stl_reverse_iterator_base; + friend class indexed_based_stl_reverse_iterator_base; +public: + typedef Index difference_type; + typedef std::random_access_iterator_tag iterator_category; + + indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {} + indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {} + + indexed_based_stl_reverse_iterator_base(const non_const_iterator& other) + : mp_xpr(other.mp_xpr), m_index(other.m_index) + {} + + indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other) + { + mp_xpr = other.mp_xpr; + m_index = other.m_index; + return *this; + } + + Derived& operator++() { --m_index; return derived(); } + Derived& operator--() { ++m_index; return derived(); } + + Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} + Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} + + friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived 
ret(a.derived()); ret += b; return ret; } + friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } + friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; } + friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } + + Derived& operator+=(Index b) { m_index -= b; return derived(); } + Derived& operator-=(Index b) { m_index += b; return derived(); } + + difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return other.m_index - m_index; + } + + difference_type operator-(const other_iterator& other) const + { + eigen_assert(mp_xpr == other.mp_xpr); + return other.m_index - m_index; + } + + bool operator==(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + bool operator> (const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + + bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } + bool operator!=(const other_iterator& 
other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } + bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } + bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } + bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } + bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } + +protected: + + Derived& derived() { return static_cast(*this); } + const Derived& derived() const { return static_cast(*this); } + + XprType *mp_xpr; + Index m_index; +}; + +template +class pointer_based_stl_iterator +{ + enum { is_lvalue = internal::is_lvalue::value }; + typedef pointer_based_stl_iterator::type> non_const_iterator; + typedef pointer_based_stl_iterator::type> const_iterator; + typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; + // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: + friend class pointer_based_stl_iterator::type>; + friend class pointer_based_stl_iterator::type>; +public: + typedef Index difference_type; + typedef typename XprType::Scalar value_type; + typedef std::random_access_iterator_tag iterator_category; + typedef typename internal::conditional::type pointer; + typedef typename internal::conditional::type reference; + + + pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} + pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) + { + m_ptr = xpr.data() + index * m_incr.value(); + } + + pointer_based_stl_iterator(const non_const_iterator& other) EIGEN_NO_THROW + : m_ptr(other.m_ptr), m_incr(other.m_incr) + {} + + pointer_based_stl_iterator& operator=(const non_const_iterator& other) 
EIGEN_NO_THROW + { + m_ptr = other.m_ptr; + m_incr.setValue(other.m_incr); + return *this; + } + + reference operator*() const { return *m_ptr; } + reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); } + pointer operator->() const { return m_ptr; } + + pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; } + pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; } + + pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;} + pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;} + + friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; } + friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; } + friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; } + friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; } + + pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; } + pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; } + + difference_type operator-(const pointer_based_stl_iterator& other) const { + return (m_ptr - other.m_ptr)/m_incr.value(); + } + + difference_type operator-(const other_iterator& other) const { + return (m_ptr - other.m_ptr)/m_incr.value(); + } + + bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; } + bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; } + bool operator< (const pointer_based_stl_iterator& other) const { 
return m_ptr < other.m_ptr; } + bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; } + bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr > other.m_ptr; } + bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; } + + bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; } + bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; } + bool operator< (const other_iterator& other) const { return m_ptr < other.m_ptr; } + bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; } + bool operator> (const other_iterator& other) const { return m_ptr > other.m_ptr; } + bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; } + +protected: + + pointer m_ptr; + internal::variable_if_dynamic m_incr; +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef generic_randaccess_stl_iterator::type> non_const_iterator; + typedef generic_randaccess_stl_iterator::type> const_iterator; +}; + +template +class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base > +{ +public: + typedef typename XprType::Scalar value_type; + +protected: + + enum { + has_direct_access = (internal::traits::Flags & DirectAccessBit) ? 1 : 0, + is_lvalue = internal::is_lvalue::value + }; + + typedef indexed_based_stl_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + // TODO currently const Transpose/Reshape expressions never returns const references, + // so lets return by value too. 
+ //typedef typename internal::conditional::type read_only_ref_t; + typedef const value_type read_only_ref_t; + +public: + + typedef typename internal::conditional::type pointer; + typedef typename internal::conditional::type reference; + + generic_randaccess_stl_iterator() : Base() {} + generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {} + using Base::operator=; + + reference operator*() const { return (*mp_xpr)(m_index); } + reference operator[](Index i) const { return (*mp_xpr)(m_index+i); } + pointer operator->() const { return &((*mp_xpr)(m_index)); } +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef subvector_stl_iterator::type, Direction> non_const_iterator; + typedef subvector_stl_iterator::type, Direction> const_iterator; +}; + +template +class subvector_stl_iterator : public indexed_based_stl_iterator_base > +{ +protected: + + enum { is_lvalue = internal::is_lvalue::value }; + + typedef indexed_based_stl_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + typedef typename internal::conditional::type SubVectorType; + typedef typename internal::conditional::type ConstSubVectorType; + + +public: + typedef typename internal::conditional::type reference; + typedef typename reference::PlainObject value_type; + +private: + class subvector_stl_iterator_ptr + { + public: + subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} + reference* operator->() { return &m_subvector; } + private: + reference m_subvector; + }; +public: + + typedef subvector_stl_iterator_ptr pointer; + + subvector_stl_iterator() : Base() {} + subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + + reference operator*() const { return (*mp_xpr).template subVector(m_index); } + reference operator[](Index i) const { return (*mp_xpr).template 
subVector(m_index+i); } + pointer operator->() const { return (*mp_xpr).template subVector(m_index); } +}; + +template +struct indexed_based_stl_iterator_traits > +{ + typedef _XprType XprType; + typedef subvector_stl_reverse_iterator::type, Direction> non_const_iterator; + typedef subvector_stl_reverse_iterator::type, Direction> const_iterator; +}; + +template +class subvector_stl_reverse_iterator : public indexed_based_stl_reverse_iterator_base > +{ +protected: + + enum { is_lvalue = internal::is_lvalue::value }; + + typedef indexed_based_stl_reverse_iterator_base Base; + using Base::m_index; + using Base::mp_xpr; + + typedef typename internal::conditional::type SubVectorType; + typedef typename internal::conditional::type ConstSubVectorType; + + +public: + typedef typename internal::conditional::type reference; + typedef typename reference::PlainObject value_type; + +private: + class subvector_stl_reverse_iterator_ptr + { + public: + subvector_stl_reverse_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} + reference* operator->() { return &m_subvector; } + private: + reference m_subvector; + }; +public: + + typedef subvector_stl_reverse_iterator_ptr pointer; + + subvector_stl_reverse_iterator() : Base() {} + subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr,index) {} + + reference operator*() const { return (*mp_xpr).template subVector(m_index); } + reference operator[](Index i) const { return (*mp_xpr).template subVector(m_index+i); } + pointer operator->() const { return (*mp_xpr).template subVector(m_index); } +}; + +} // namespace internal + + +/** returns an iterator to the first element of the 1D vector or array + * \only_for_vectors + * \sa end(), cbegin() + */ +template +inline typename DenseBase::iterator DenseBase::begin() +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return iterator(derived(), 0); +} + +/** const version of begin() */ +template +inline typename DenseBase::const_iterator DenseBase::begin() const 
+{ + return cbegin(); +} + +/** returns a read-only const_iterator to the first element of the 1D vector or array + * \only_for_vectors + * \sa cend(), begin() + */ +template +inline typename DenseBase::const_iterator DenseBase::cbegin() const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return const_iterator(derived(), 0); +} + +/** returns an iterator to the element following the last element of the 1D vector or array + * \only_for_vectors + * \sa begin(), cend() + */ +template +inline typename DenseBase::iterator DenseBase::end() +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return iterator(derived(), size()); +} + +/** const version of end() */ +template +inline typename DenseBase::const_iterator DenseBase::end() const +{ + return cend(); +} + +/** returns a read-only const_iterator to the element following the last element of the 1D vector or array + * \only_for_vectors + * \sa begin(), cend() + */ +template +inline typename DenseBase::const_iterator DenseBase::cend() const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return const_iterator(derived(), size()); +} + +} // namespace Eigen + +#endif // EIGEN_STLITERATORS_H diff --git a/externals/eigen/Eigen/src/Core/Stride.h b/externals/eigen/Eigen/src/Core/Stride.h index 513742f3..6494d514 100644 --- a/externals/eigen/Eigen/src/Core/Stride.h +++ b/externals/eigen/Eigen/src/Core/Stride.h @@ -10,7 +10,7 @@ #ifndef EIGEN_STRIDE_H #define EIGEN_STRIDE_H -namespace Eigen { +namespace Eigen { /** \class Stride * \ingroup Core_Module @@ -38,6 +38,10 @@ namespace Eigen { * \include Map_general_stride.cpp * Output: \verbinclude Map_general_stride.out * + * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime + * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were + * not allowed). 
+ * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template @@ -55,6 +59,8 @@ class Stride Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) { + // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic. + // FIXME: for Eigen 4 we should also unify this API with fix<> eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); } @@ -63,7 +69,6 @@ class Stride Stride(Index outerStride, Index innerStride) : m_outer(outerStride), m_inner(innerStride) { - eigen_assert(innerStride>=0 && outerStride>=0); } /** Copy constructor */ @@ -73,10 +78,10 @@ class Stride {} /** \returns the outer stride */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outer() const { return m_outer.value(); } /** \returns the inner stride */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index inner() const { return m_inner.value(); } protected: diff --git a/externals/eigen/Eigen/src/Core/Swap.h b/externals/eigen/Eigen/src/Core/Swap.h index d7020091..180a4e5a 100644 --- a/externals/eigen/Eigen/src/Core/Swap.h +++ b/externals/eigen/Eigen/src/Core/Swap.h @@ -30,12 +30,13 @@ class generic_dense_assignment_kernel Functor; - EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} template - void assignPacket(Index row, Index col) + EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) { PacketType tmp = m_src.template packet(row,col); const_cast(m_src).template writePacket(row,col, m_dst.template packet(row,col)); @@ -43,7 +44,7 @@ class generic_dense_assignment_kernel - void assignPacket(Index index) + EIGEN_STRONG_INLINE void assignPacket(Index index) { PacketType tmp = 
m_src.template packet(index); const_cast(m_src).template writePacket(index, m_dst.template packet(index)); @@ -52,7 +53,7 @@ class generic_dense_assignment_kernel - void assignPacketByOuterInner(Index outer, Index inner) + EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = Base::rowIndexByOuterInner(outer, inner); Index col = Base::colIndexByOuterInner(outer, inner); diff --git a/externals/eigen/Eigen/src/Core/Transpose.h b/externals/eigen/Eigen/src/Core/Transpose.h index 79b767bc..2bc658f4 100644 --- a/externals/eigen/Eigen/src/Core/Transpose.h +++ b/externals/eigen/Eigen/src/Core/Transpose.h @@ -11,7 +11,7 @@ #ifndef EIGEN_TRANSPOSE_H #define EIGEN_TRANSPOSE_H -namespace Eigen { +namespace Eigen { namespace internal { template @@ -61,24 +61,27 @@ template class Transpose typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC - explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {} + explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_matrix.rows(); } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::remove_reference::type& nestedExpression() { return m_matrix; } /** \internal */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } @@ -122,8 
+125,10 @@ template class TransposeImpl EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl) - EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); } - EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index innerStride() const { return derived().nestedExpression().innerStride(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index outerStride() const { return derived().nestedExpression().outerStride(); } typedef typename internal::conditional< internal::is_lvalue::value, @@ -131,21 +136,25 @@ template class TransposeImpl const Scalar >::type ScalarWithConstIfNotLvalue; - EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } - EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar* data() const { return derived().nestedExpression().data(); } // FIXME: shall we keep the const version of coeffRef? - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index rowId, Index colId) const { return derived().nestedExpression().coeffRef(colId, rowId); } - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar& coeffRef(Index index) const { return derived().nestedExpression().coeffRef(index); } + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl) }; /** \returns an expression of the transpose of *this. 
@@ -168,7 +177,8 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -180,7 +190,8 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -206,7 +217,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -228,11 +239,10 @@ struct inplace_transpose_selector; template struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { - m.matrix().template triangularView().swap(m.matrix().transpose()); + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); } }; -// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. 
template struct inplace_transpose_selector { // PacketSize x PacketSize static void run(MatrixType& m) { @@ -249,16 +259,66 @@ struct inplace_transpose_selector { // PacketSize x Packet } }; + +template +void BlockedInPlaceTranspose(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + const Index PacketSize = internal::packet_traits::size; + eigen_assert(m.rows() == m.cols()); + int row_start = 0; + for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) { + for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) { + PacketBlock A; + if (row_start == col_start) { + for (Index i=0; i(row_start + i,col_start); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]); + } else { + PacketBlock B; + for (Index i=0; i(row_start + i,col_start); + B.packet[i] = m.template packetByOuterInner(col_start + i, row_start); + } + internal::ptranspose(A); + internal::ptranspose(B); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]); + m.template writePacket(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]); + } + } + } + } + for (Index row = row_start; row < m.rows(); ++row) { + m.matrix().row(row).head(row).swap( + m.matrix().col(row).head(row).transpose()); + } +} + template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // non square or dynamic matrix static void run(MatrixType& m) { - if (m.rows()==m.cols()) - m.matrix().template triangularView().swap(m.matrix().transpose()); - else + typedef typename MatrixType::Scalar Scalar; + if (m.rows() == m.cols()) { + const Index PacketSize = internal::packet_traits::size; + if (!NumTraits::IsComplex && m.rows() >= PacketSize) { + 
if ((m.rows() % PacketSize) == 0) + BlockedInPlaceTranspose::Alignment>(m); + else + BlockedInPlaceTranspose(m); + } + else { + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); + } + } else { m = m.transpose().eval(); + } } }; + } // end namespace internal /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose. @@ -276,12 +336,12 @@ struct inplace_transpose_selector { // non squ * Notice however that this method is only useful if you want to replace a matrix by its own transpose. * If you just need the transpose of a matrix, use transpose(). * - * \note if the matrix is not square, then \c *this must be a resizable matrix. + * \note if the matrix is not square, then \c *this must be a resizable matrix. * This excludes (non-square) fixed-size matrices, block-expressions and maps. * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -312,7 +372,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } @@ -391,7 +451,8 @@ struct checkTransposeAliasing_impl template void check_for_aliasing(const Dst &dst, const Src &src) { - internal::checkTransposeAliasing_impl::run(dst, src); + if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1) + internal::checkTransposeAliasing_impl::run(dst, src); } } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/Transpositions.h b/externals/eigen/Eigen/src/Core/Transpositions.h index 19c17bb4..38a7b01c 100644 --- a/externals/eigen/Eigen/src/Core/Transpositions.h 
+++ b/externals/eigen/Eigen/src/Core/Transpositions.h @@ -10,20 +10,22 @@ #ifndef EIGEN_TRANSPOSITIONS_H #define EIGEN_TRANSPOSITIONS_H -namespace Eigen { +namespace Eigen { template class TranspositionsBase { typedef internal::traits Traits; - + public: typedef typename Traits::IndicesType IndicesType; typedef typename IndicesType::Scalar StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 + EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast(this); } /** Copies the \a other transpositions into \c *this */ @@ -33,26 +35,19 @@ class TranspositionsBase indices() = other.indices(); return derived(); } - - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** This is a special case of the templated operator=. Its purpose is to - * prevent a default operator= from hiding the templated operator=. - */ - Derived& operator=(const TranspositionsBase& other) - { - indices() = other.indices(); - return derived(); - } - #endif /** \returns the number of transpositions */ + EIGEN_DEVICE_FUNC Index size() const { return indices().size(); } /** \returns the number of rows of the equivalent permutation matrix */ + EIGEN_DEVICE_FUNC Index rows() const { return indices().size(); } /** \returns the number of columns of the equivalent permutation matrix */ + EIGEN_DEVICE_FUNC Index cols() const { return indices().size(); } /** Direct access to the underlying index vector */ + EIGEN_DEVICE_FUNC inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); } /** Direct access to the underlying index vector */ inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); } @@ -66,8 +61,10 @@ class TranspositionsBase inline StorageIndex& operator[](Index i) { return indices()(i); } /** const version of indices(). 
*/ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return derived().indices(); } /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return derived().indices(); } /** Resizes to given size. */ @@ -84,7 +81,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? - // might be usefull when the target matrix expression is complex, e.g.: + // might be useful when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template @@ -171,12 +168,6 @@ class Transpositions : public TranspositionsBase& other) : m_indices(other.indices()) {} - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** Standard copy constructor. Defined only to prevent a default copy constructor - * from hiding the other templated constructor */ - inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {} - #endif - /** Generic constructor from expression of the transposition indices. */ template explicit inline Transpositions(const MatrixBase& indices) : m_indices(indices) @@ -189,25 +180,16 @@ class Transpositions : public TranspositionsBase,P #endif /** const version of indices(). */ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; } - + /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; } protected: @@ -306,21 +290,12 @@ class TranspositionsWrapper return Base::operator=(other); } - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** This is a special case of the templated operator=. Its purpose is to - * prevent a default operator= from hiding the templated operator=. - */ - TranspositionsWrapper& operator=(const TranspositionsWrapper& other) - { - m_indices = other.m_indices; - return *this; - } - #endif - /** const version of indices(). 
*/ + EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; } /** \returns a reference to the stored array representing the transpositions. */ + EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; } protected: @@ -374,9 +349,12 @@ class Transpose > explicit Transpose(const TranspositionType& t) : m_transpositions(t) {} - Index size() const { return m_transpositions.size(); } - Index rows() const { return m_transpositions.size(); } - Index cols() const { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index size() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_transpositions.size(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_transpositions.size(); } /** \returns the \a matrix with the inverse transpositions applied to the columns. */ @@ -384,7 +362,7 @@ class Transpose > const Product operator*(const MatrixBase& matrix, const Transpose& trt) { - return Product(matrix.derived(), trt.derived()); + return Product(matrix.derived(), trt); } /** \returns the \a matrix with the inverse transpositions applied to the rows. 
@@ -395,7 +373,8 @@ class Transpose > { return Product(*this, matrix.derived()); } - + + EIGEN_DEVICE_FUNC const TranspositionType& nestedExpression() const { return m_transpositions; } protected: diff --git a/externals/eigen/Eigen/src/Core/TriangularMatrix.h b/externals/eigen/Eigen/src/Core/TriangularMatrix.h index 667ef09d..fdb8bc15 100644 --- a/externals/eigen/Eigen/src/Core/TriangularMatrix.h +++ b/externals/eigen/Eigen/src/Core/TriangularMatrix.h @@ -11,12 +11,12 @@ #ifndef EIGEN_TRIANGULARMATRIX_H #define EIGEN_TRIANGULARMATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { - + template struct triangular_solve_retval; - + } /** \class TriangularBase @@ -34,16 +34,16 @@ template class TriangularBase : public EigenBase ColsAtCompileTime = internal::traits::ColsAtCompileTime, MaxRowsAtCompileTime = internal::traits::MaxRowsAtCompileTime, MaxColsAtCompileTime = internal::traits::MaxColsAtCompileTime, - + SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, internal::traits::ColsAtCompileTime>::ret), /**< This is equal to the number of coefficients, i.e. the number of * rows times the number of columns, or to \a Dynamic if this is not * known at compile-time. 
\sa RowsAtCompileTime, ColsAtCompileTime */ - + MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret) - + }; typedef typename internal::traits::Scalar Scalar; typedef typename internal::traits::StorageKind StorageKind; @@ -53,18 +53,19 @@ template class TriangularBase : public EigenBase typedef Derived const& Nested; EIGEN_DEVICE_FUNC - inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); } + inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return derived().cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index outerStride() const EIGEN_NOEXCEPT { return derived().outerStride(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index innerStride() const EIGEN_NOEXCEPT { return derived().innerStride(); } - EIGEN_DEVICE_FUNC - inline Index rows() const { return derived().rows(); } - EIGEN_DEVICE_FUNC - inline Index cols() const { return derived().cols(); } - EIGEN_DEVICE_FUNC - inline Index outerStride() const { return derived().outerStride(); } - EIGEN_DEVICE_FUNC - inline Index innerStride() const { return derived().innerStride(); } - // dummy resize function + EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { EIGEN_UNUSED_VARIABLE(rows); @@ -155,7 +156,7 @@ template class TriangularBase : public EigenBase * \param MatrixType the type of the object in which we are taking the triangular part * \param Mode the kind of triangular matrix expression to construct. Can be #Upper, * #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower. 
- * This is in fact a bit field; it must have either #Upper or #Lower, + * This is in fact a bit field; it must have either #Upper or #Lower, * and additionally it may have #UnitDiag or #ZeroDiag or neither. * * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular @@ -197,7 +198,8 @@ template class TriangularView typedef typename internal::traits::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; typedef typename internal::remove_all::type MatrixConjugateReturnType; - + typedef TriangularView::type, _Mode> ConstTriangularView; + public: typedef typename internal::traits::StorageKind StorageKind; @@ -216,17 +218,15 @@ template class TriangularView EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} - - using Base::operator=; - TriangularView& operator=(const TriangularView &other) - { return Base::operator=(other); } + + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView) /** \copydoc EigenBase::rows() */ - EIGEN_DEVICE_FUNC - inline Index rows() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } /** \copydoc EigenBase::cols() */ - EIGEN_DEVICE_FUNC - inline Index cols() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** \returns a const reference to the nested expression */ EIGEN_DEVICE_FUNC @@ -235,13 +235,25 @@ template class TriangularView /** \returns a reference to the nested expression */ EIGEN_DEVICE_FUNC NestedExpression& nestedExpression() { return m_matrix; } - + typedef TriangularView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. 
+ */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_matrix.template conjugateIf()); + } + typedef TriangularView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -257,7 +269,7 @@ template class TriangularView typename MatrixType::TransposeReturnType tmp(m_matrix); return TransposeReturnType(tmp); } - + typedef TriangularView ConstTransposeReturnType; /** \sa MatrixBase::transpose() const */ EIGEN_DEVICE_FUNC @@ -268,10 +280,10 @@ template class TriangularView template EIGEN_DEVICE_FUNC - inline const Solve + inline const Solve solve(const MatrixBase& other) const { return Solve(*this, other.derived()); } - + // workaround MSVC ICE #if EIGEN_COMP_MSVC template @@ -315,7 +327,7 @@ template class TriangularView else return m_matrix.diagonal().prod(); } - + protected: MatrixTypeNested m_matrix; @@ -377,7 +389,7 @@ template class TriangularViewImpl<_Mat internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op()); return derived(); } - + /** \sa MatrixBase::operator*=() */ EIGEN_DEVICE_FUNC TriangularViewType& operator*=(const typename internal::traits::Scalar& other) { return *this = derived().nestedExpression() * other; } @@ -435,14 +447,14 @@ template class TriangularViewImpl<_Mat TriangularViewType& operator=(const TriangularViewImpl& other) { return *this = other.derived().nestedExpression(); } - /** \deprecated */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const TriangularBase& other); - /** \deprecated */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const MatrixBase& other); #endif @@ -470,7 +482,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a 
Side==OnTheRight. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -488,7 +500,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -497,7 +508,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. 
*/ @@ -523,10 +534,10 @@ template class TriangularViewImpl<_Mat call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op()); } - /** \deprecated - * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ + /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ template - EIGEN_DEVICE_FUNC + /** \deprecated */ + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void swap(MatrixBase const & other) { EIGEN_STATIC_ASSERT_LVALUE(OtherDerived); @@ -544,6 +555,10 @@ template class TriangularViewImpl<_Mat template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta); + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl) + }; /*************************************************************************** @@ -554,7 +569,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -564,7 +579,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -573,7 +588,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -583,7 +598,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void 
TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -598,7 +613,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -624,6 +639,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -633,6 +649,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -698,7 +715,7 @@ bool MatrixBase::isLowerTriangular(const RealScalar& prec) const namespace internal { - + // TODO currently a triangular expression has the form TriangularView<.,.> // in the future triangular-ness should be defined by the expression traits // such that Transpose > is valid. (currently TriangularBase::transpose() is overloaded to make it work) @@ -715,6 +732,7 @@ struct unary_evaluator, IndexBased> { typedef TriangularView XprType; typedef evaluator::type> Base; + EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; @@ -726,7 +744,7 @@ struct Dense2Triangular {}; template struct triangular_assignment_loop; - + /** \internal Specialization of the dense assignment kernel for triangular matrices. * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions. 
* \tparam UpLo must be either Lower or Upper @@ -743,17 +761,17 @@ class triangular_dense_assignment_kernel : public generic_dense_assignment_kerne using Base::m_src; using Base::m_functor; public: - + typedef typename Base::DstEvaluatorType DstEvaluatorType; typedef typename Base::SrcEvaluatorType SrcEvaluatorType; typedef typename Base::Scalar Scalar; typedef typename Base::AssignmentTraits AssignmentTraits; - - + + EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} - + #ifdef EIGEN_INTERNAL_DEBUGGING EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col) { @@ -763,16 +781,16 @@ class triangular_dense_assignment_kernel : public generic_dense_assignment_kerne #else using Base::assignCoeff; #endif - + EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) { if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1)); else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0)); else if(Mode==0) Base::assignCoeff(id,id); } - + EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col) - { + { eigen_internal_assert(row!=col); if(SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0)); @@ -793,17 +811,17 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) dst.resize(dstRows, dstCols); DstEvaluatorType dstEvaluator(dst); - + typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite, DstEvaluatorType,SrcEvaluatorType,Functor> Kernel; Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - + enum { unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost - && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= 
EIGEN_UNROLLING_LIMIT + && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT }; - + triangular_assignment_loop::run(kernel); } @@ -825,8 +843,8 @@ struct Assignment EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode)); - - call_triangular_assignment_loop(dst, src, func); + + call_triangular_assignment_loop(dst, src, func); } }; @@ -835,7 +853,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -844,7 +862,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -855,19 +873,19 @@ struct triangular_assignment_loop // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; - + enum { col = (UnrollCount-1) / DstXprType::RowsAtCompileTime, row = (UnrollCount-1) % DstXprType::RowsAtCompileTime }; - + typedef typename Kernel::Scalar Scalar; EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) { triangular_assignment_loop::run(kernel); - + if(row==col) kernel.assignDiagonalCoeff(row); else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row } else i = maxi; - + if(i * If the matrix is triangular, the opposite part is set to zero. 
*/ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); - internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); + internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); } namespace internal { - + // Triangular = Product template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> struct Assignment, internal::assign_op::Scalar>, Dense2Triangular> @@ -950,7 +968,7 @@ struct Assignment, internal::assign_ if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) dst.resize(dstRows, dstCols); - dst._assignProduct(src, 1, 0); + dst._assignProduct(src, Scalar(1), false); } }; @@ -961,7 +979,7 @@ struct Assignment, internal::add_ass typedef Product SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { - dst._assignProduct(src, 1, 1); + dst._assignProduct(src, Scalar(1), true); } }; @@ -972,7 +990,7 @@ struct Assignment, internal::sub_ass typedef Product SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { - dst._assignProduct(src, -1, 1); + dst._assignProduct(src, Scalar(-1), true); } }; diff --git a/externals/eigen/Eigen/src/Core/VectorBlock.h b/externals/eigen/Eigen/src/Core/VectorBlock.h index d72fbf7e..71c5b95e 100644 --- a/externals/eigen/Eigen/src/Core/VectorBlock.h +++ b/externals/eigen/Eigen/src/Core/VectorBlock.h @@ -35,7 +35,7 @@ struct traits > * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and * most of the time this is the only way it is used. 
* - * However, if you want to directly maniputate sub-vector expressions, + * However, if you want to directly manipulate sub-vector expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. * @@ -71,8 +71,8 @@ template class VectorBlock /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start, Index size) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start, Index size) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) @@ -82,8 +82,8 @@ template class VectorBlock /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC - inline VectorBlock(VectorType& vector, Index start) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + VectorBlock(VectorType& vector, Index start) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); diff --git a/externals/eigen/Eigen/src/Core/VectorwiseOp.h b/externals/eigen/Eigen/src/Core/VectorwiseOp.h index 4fe267e9..870f4f1e 100644 --- a/externals/eigen/Eigen/src/Core/VectorwiseOp.h +++ b/externals/eigen/Eigen/src/Core/VectorwiseOp.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2008-2010 Gael Guennebaud +// Copyright (C) 2008-2019 Gael Guennebaud // Copyright (C) 2006-2008 Benoit Jacob // // This Source Code Form is subject to the terms of the Mozilla @@ -65,10 +65,10 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr \ - struct member_##MEMBER { \ - EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ - typedef ResultType result_type; \ - template struct Cost \ - { enum { value = COST }; }; \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ - ResultType operator()(const XprType& mat) const \ - { return mat.MEMBER(); } \ +template struct partial_redux_dummy_func; + +#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \ + template \ + struct member_##MEMBER { \ + EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ + typedef ResultType result_type; \ + typedef BINARYOP BinaryOp; \ + template struct Cost { enum { value = COST }; }; \ + enum { Vectorizable = VECTORIZABLE }; \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + ResultType operator()(const XprType& mat) const \ + { return mat.MEMBER(); } \ + BinaryOp binaryFunc() const { return BinaryOp(); } \ } +#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ + EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func) + namespace internal { -EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits >::Cost ); -EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits::AddCost + NumTraits::MulCost); -EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(maxCoeff, 
(Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); -EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); -template +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits::AddCost, 1, internal::scalar_sum_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_min_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_max_op); +EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits::MulCost, 1, internal::scalar_product_op); + +template struct member_lpnorm { typedef ResultType result_type; - template struct Cost + enum { Vectorizable = 0 }; + template struct Cost { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; EIGEN_DEVICE_FUNC member_lpnorm() {} template @@ -121,17 +128,20 @@ struct member_lpnorm { { return mat.template lpNorm

(); } }; -template +template struct member_redux { + typedef BinaryOpT BinaryOp; typedef typename result_of< BinaryOp(const Scalar&,const Scalar&) >::type result_type; - template struct Cost - { enum { value = (Size-1) * functor_traits::Cost }; }; + + enum { Vectorizable = functor_traits::PacketAccess }; + template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {} template EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase& mat) const { return mat.redux(m_functor); } + const BinaryOp& binaryFunc() const { return m_functor; } const BinaryOp m_functor; }; } @@ -139,18 +149,38 @@ struct member_redux { /** \class VectorwiseOp * \ingroup Core_Module * - * \brief Pseudo expression providing partial reduction operations + * \brief Pseudo expression providing broadcasting and partial reduction operations * * \tparam ExpressionType the type of the object on which to do partial reductions - * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) + * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal) * - * This class represents a pseudo expression with partial reduction features. + * This class represents a pseudo expression with broadcasting and partial reduction features. * It is the return type of DenseBase::colwise() and DenseBase::rowwise() - * and most of the time this is the only way it is used. + * and most of the time this is the only way it is explicitly used. + * + * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()` + * where `foo` is any method of `VectorwiseOp`. 
This expression is equivalent to applying `foo()` to each + * column of `A` and then re-assemble the outputs in a matrix expression: + * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out * + * The begin() and end() methods are obviously exceptions to the previous rule as they + * return STL-compatible begin/end iterators to the rows or columns of the nested expression. + * Typical use cases include for-range-loop and calls to STL algorithms: + * + * Example: \include MatrixBase_colwise_iterator_cxx11.cpp + * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out + * + * For a partial reduction on an empty input, some rules apply. + * For the sake of clarity, let's consider a vertical reduction: + * - If the number of columns is zero, then a 1x0 row-major vector expression is returned. + * - Otherwise, if the number of rows is zero, then + * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.) 
+ * - a row vector of ones is returned for a product reduction (e.g., MatrixXd(n,0).colwise().prod()) + * - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op)) + * * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr */ template class VectorwiseOp @@ -163,11 +193,11 @@ template class VectorwiseOp typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; - template class Functor, - typename Scalar_=Scalar> struct ReturnType + template class Functor, + typename ReturnScalar=Scalar> struct ReturnType { typedef PartialReduxExpr, + Functor, Direction > Type; }; @@ -187,23 +217,6 @@ template class VectorwiseOp protected: - typedef typename internal::conditional::type SubVector; - /** \internal - * \returns the i-th subvector according to the \c Direction */ - EIGEN_DEVICE_FUNC - SubVector subVector(Index i) - { - return SubVector(m_matrix.derived(),i); - } - - /** \internal - * \returns the number of subvectors in the direction \c Direction */ - EIGEN_DEVICE_FUNC - Index subVectors() const - { return isVertical?m_matrix.cols():m_matrix.rows(); } - template struct ExtendedType { typedef Replicate class VectorwiseOp EIGEN_DEVICE_FUNC inline const ExpressionType& _expression() const { return m_matrix; } + #ifdef EIGEN_PARSED_BY_DOXYGEN + /** STL-like RandomAccessIterator + * iterator type over the columns or rows as returned by the begin() and end() methods. 
+ */ + random_access_iterator_type iterator; + /** This is the const version of iterator (aka read-only) */ + random_access_iterator_type const_iterator; + #else + typedef internal::subvector_stl_iterator iterator; + typedef internal::subvector_stl_iterator const_iterator; + typedef internal::subvector_stl_reverse_iterator reverse_iterator; + typedef internal::subvector_stl_reverse_iterator const_reverse_iterator; + #endif + + /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression. + * \sa end(), cbegin() + */ + iterator begin() { return iterator (m_matrix, 0); } + /** const version of begin() */ + const_iterator begin() const { return const_iterator(m_matrix, 0); } + /** const version of begin() */ + const_iterator cbegin() const { return const_iterator(m_matrix, 0); } + + /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression. + * \sa rend(), crbegin() + */ + reverse_iterator rbegin() { return reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + /** const version of rbegin() */ + const_reverse_iterator rbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + /** const version of rbegin() */ + const_reverse_iterator crbegin() const { return const_reverse_iterator (m_matrix, m_matrix.template subVectors()-1); } + + /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression + * \sa begin(), cend() + */ + iterator end() { return iterator (m_matrix, m_matrix.template subVectors()); } + /** const version of end() */ + const_iterator end() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } + /** const version of end() */ + const_iterator cend() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } + + /** returns a reverse iterator to the row (resp. column) before the first row (resp. 
column) of the nested expression + * \sa begin(), cend() + */ + reverse_iterator rend() { return reverse_iterator (m_matrix, -1); } + /** const version of rend() */ + const_reverse_iterator rend() const { return const_reverse_iterator (m_matrix, -1); } + /** const version of rend() */ + const_reverse_iterator crend() const { return const_reverse_iterator (m_matrix, -1); } + /** \returns a row or column vector expression of \c *this reduxed by \a func * * The template parameter \a BinaryOp is the type of the functor * of the custom redux operator. Note that func must be an associative operator. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise() */ template EIGEN_DEVICE_FUNC const typename ReduxReturnType::Type redux(const BinaryOp& func = BinaryOp()) const - { return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); + } typedef typename ReturnType::Type MinCoeffReturnType; typedef typename ReturnType::Type MaxCoeffReturnType; - typedef typename ReturnType::Type SquaredNormReturnType; - typedef typename ReturnType::Type NormReturnType; + typedef PartialReduxExpr, const ExpressionTypeNestedCleaned>,internal::member_sum,Direction> SquaredNormReturnType; + typedef CwiseUnaryOp, const SquaredNormReturnType> NormReturnType; typedef typename ReturnType::Type BlueNormReturnType; typedef typename ReturnType::Type StableNormReturnType; typedef typename ReturnType::Type HypotNormReturnType; typedef typename ReturnType::Type SumReturnType; - typedef typename ReturnType::Type MeanReturnType; + typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType; typedef typename ReturnType::Type AllReturnType; typedef typename 
ReturnType::Type AnyReturnType; - typedef PartialReduxExpr, Direction> CountReturnType; + typedef PartialReduxExpr, Direction> CountReturnType; typedef typename ReturnType::Type ProdReturnType; typedef Reverse ConstReverseReturnType; typedef Reverse ReverseReturnType; template struct LpNormReturnType { - typedef PartialReduxExpr,Direction> Type; + typedef PartialReduxExpr,Direction> Type; }; /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_minCoeff.cpp @@ -302,11 +374,17 @@ template class VectorwiseOp * \sa DenseBase::minCoeff() */ EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const - { return MinCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MinCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the largest coefficient * of each column (or row) of the referenced expression. * + * \warning the size along the reduction direction must be strictly positive, + * otherwise an assertion is triggered. + * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_maxCoeff.cpp @@ -315,7 +393,10 @@ template class VectorwiseOp * \sa DenseBase::maxCoeff() */ EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const - { return MaxCoeffReturnType(_expression()); } + { + eigen_assert(redux_length()>0 && "you are using an empty matrix"); + return MaxCoeffReturnType(_expression()); + } /** \returns a row (or column) vector expression of the squared norm * of each column (or row) of the referenced expression. 
@@ -327,7 +408,7 @@ template class VectorwiseOp * \sa DenseBase::squaredNorm() */ EIGEN_DEVICE_FUNC const SquaredNormReturnType squaredNorm() const - { return SquaredNormReturnType(_expression()); } + { return SquaredNormReturnType(m_matrix.cwiseAbs2()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -339,7 +420,7 @@ template class VectorwiseOp * \sa DenseBase::norm() */ EIGEN_DEVICE_FUNC const NormReturnType norm() const - { return NormReturnType(_expression()); } + { return NormReturnType(squaredNorm()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -404,7 +485,7 @@ template class VectorwiseOp * \sa DenseBase::mean() */ EIGEN_DEVICE_FUNC const MeanReturnType mean() const - { return MeanReturnType(_expression()); } + { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); } /** \returns a row (or column) vector expression representing * whether \b all coefficients of each respective column (or row) are \c true. 
@@ -500,7 +581,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME - return const_cast(m_matrix = extendedTo(other.derived())); + return m_matrix = extendedTo(other.derived()); } /** Adds the vector \a other to each subvector of \c *this */ @@ -510,7 +591,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix += extendedTo(other.derived())); + return m_matrix += extendedTo(other.derived()); } /** Substracts the vector \a other to each subvector of \c *this */ @@ -520,7 +601,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return const_cast(m_matrix -= extendedTo(other.derived())); + return m_matrix -= extendedTo(other.derived()); } /** Multiples each subvector of \c *this by the vector \a other */ @@ -532,7 +613,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix *= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Divides each subvector of \c *this by the vector \a other */ @@ -544,7 +625,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix /= extendedTo(other.derived()); - return const_cast(m_matrix); + return m_matrix; } /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */ @@ -609,7 +690,7 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC CwiseBinaryOp, const ExpressionTypeNestedCleaned, - const typename OppositeExtendedType::Type>::Type> + const typename OppositeExtendedType::Type> normalized() const { return 
m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); } @@ -658,7 +739,15 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC const HNormalizedReturnType hnormalized() const; +# ifdef EIGEN_VECTORWISEOP_PLUGIN +# include EIGEN_VECTORWISEOP_PLUGIN +# endif + protected: + Index redux_length() const + { + return Direction==Vertical ? m_matrix.rows() : m_matrix.cols(); + } ExpressionTypeNested m_matrix; }; @@ -670,7 +759,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +773,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); diff --git a/externals/eigen/Eigen/src/Core/Visitor.h b/externals/eigen/Eigen/src/Core/Visitor.h index 54c1883d..00bcca87 100644 --- a/externals/eigen/Eigen/src/Core/Visitor.h +++ b/externals/eigen/Eigen/src/Core/Visitor.h @@ -10,7 +10,7 @@ #ifndef EIGEN_VISITOR_H #define EIGEN_VISITOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -40,6 +40,14 @@ struct visitor_impl } }; +// This specialization enables visitors on empty matrices at compile-time +template +struct visitor_impl { + EIGEN_DEVICE_FUNC + static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) + {} +}; + template struct visitor_impl { @@ -62,22 +70,22 @@ class visitor_evaluator public: EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} - + typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; - + enum { RowsAtCompileTime = XprType::RowsAtCompileTime, CoeffReadCost = 
internal::evaluator::CoeffReadCost }; - - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_xpr.size(); } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const { return m_evaluator.coeff(row, col); } - + protected: internal::evaluator m_evaluator; const XprType &m_xpr; @@ -99,6 +107,8 @@ class visitor_evaluator * \note compared to one or two \em for \em loops, visitors offer automatic * unrolling for small fixed size matrix. * + * \note if the matrix is empty, then the visitor is left unchanged. + * * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux() */ template @@ -106,12 +116,15 @@ template EIGEN_DEVICE_FUNC void DenseBase::visit(Visitor& visitor) const { + if(size()==0) + return; + typedef typename internal::visitor_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - + enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits::Cost) <= EIGEN_UNROLLING_LIMIT }; return internal::visitor_impl::run(thisEval, visitor); } @@ -124,6 +137,9 @@ namespace internal { template struct coeff_visitor { + // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc + EIGEN_DEVICE_FUNC + coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; Scalar res; @@ -141,7 +157,7 @@ struct coeff_visitor * * \sa 
DenseBase::minCoeff(Index*, Index*) */ -template +template struct min_coeff_visitor : coeff_visitor { typedef typename Derived::Scalar Scalar; @@ -157,8 +173,40 @@ struct min_coeff_visitor : coeff_visitor } }; -template -struct functor_traits > { +template +struct min_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value < this->res)) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct min_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(value) || value < this->res) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template + struct functor_traits > { enum { Cost = NumTraits::AddCost }; @@ -169,10 +217,10 @@ struct functor_traits > { * * \sa DenseBase::maxCoeff(Index*, Index*) */ -template +template struct max_coeff_visitor : coeff_visitor { - typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar Scalar; EIGEN_DEVICE_FUNC void operator() (const Scalar& value, Index i, Index j) { @@ -185,8 +233,40 @@ struct max_coeff_visitor : coeff_visitor } }; -template -struct functor_traits > { +template +struct max_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(this->res) || (!(numext::isnan)(value) && value > this->res)) + { + this->res = value; + this->row = i; + this->col = j; + } + } +}; + +template +struct max_coeff_visitor : coeff_visitor +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + void operator() (const Scalar& value, Index i, Index j) + { + if((numext::isnan)(value) || value > this->res) + { + this->res = value; + this->row = i; + 
this->col = j; + } + } +}; + +template +struct functor_traits > { enum { Cost = NumTraits::AddCost }; @@ -196,17 +276,24 @@ struct functor_traits > { /** \fn DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const * \returns the minimum of all coefficients of *this and puts in *row and *col its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const { - internal::min_coeff_visitor minVisitor; + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; if (colId) *colId = minVisitor.col; @@ -214,18 +301,25 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const } /** \returns the minimum of all coefficients of *this and puts in *index its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. 
* * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::min_coeff_visitor minVisitor; + internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row); return minVisitor.res; @@ -233,17 +327,24 @@ DenseBase::minCoeff(IndexType* index) const /** \fn DenseBase::maxCoeff(IndexType* rowId, IndexType* colId) const * \returns the maximum of all coefficients of *this and puts in *row and *col its location. - * \warning the result is undefined if \c *this contains NaN. + * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { - internal::max_coeff_visitor maxVisitor; + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; if (colPtr) *colPtr = maxVisitor.col; @@ -251,18 +352,25 @@ DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const } /** \returns the maximum of all coefficients of *this and puts in *index its location. - * \warning the result is undefined if \c *this contains NaN. 
+ * + * In case \c *this contains NaN, NaNPropagation determines the behavior: + * NaNPropagation == PropagateFast : undefined + * NaNPropagation == PropagateNaN : result is NaN + * NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN + * \warning the matrix must be not empty, otherwise an assertion is triggered. * * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() */ template -template +template EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* index) const { + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - internal::max_coeff_visitor maxVisitor; + internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row; return maxVisitor.res; diff --git a/externals/eigen/Eigen/src/Core/arch/AVX/Complex.h b/externals/eigen/Eigen/src/Core/arch/AVX/Complex.h index 99439c8a..ab7bd6c6 100644 --- a/externals/eigen/Eigen/src/Core/arch/AVX/Complex.h +++ b/externals/eigen/Eigen/src/Core/arch/AVX/Complex.h @@ -22,6 +22,7 @@ struct Packet4cf __m256 v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet4cf type; @@ -37,6 +38,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -44,8 +46,20 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet8f as_real; + enum { + size=4, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + 
masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -67,10 +81,17 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con return Packet4cf(result); } +template <> +EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { + __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); + return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } @@ -140,87 +161,13 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe Packet2cf(_mm256_extractf128_ps(a.v,1)))); } -template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) -{ - Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, 
_MM_SHUFFLE(3, 1, 2 ,0)); - Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); - t0 = _mm256_hadd_ps(t0,t1); - Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); - Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); - t2 = _mm256_hadd_ps(t2,t3); - - t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); - t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); - - return Packet4cf(_mm256_add_ps(t1,t3)); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) { return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const 
Packet8f& x, const Packet4cf& y) const - { return Packet4cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const - { return Packet4cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { @@ -244,6 +191,7 @@ struct Packet2cd __m256d v; }; +#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet2cd type; @@ -259,6 +207,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -266,8 +215,20 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet4d as_real; + enum { + size=2, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } @@ -288,10 +249,17 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } +template <> +EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { + __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); + return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); +} + 
+template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } @@ -343,80 +311,13 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) -{ - Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); - Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); - - return Packet2cd(_mm256_add_pd(t0,t1)); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) { return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) - { - if (Offset==0) return; - palign_impl::run(first.v, second.v); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(a, 
pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const - { return Packet2cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const - { return Packet2cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) { @@ -456,24 +357,12 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex b) -{ - return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,1|2)); -} - -template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex b) -{ - return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,1|2)); +template<> EIGEN_STRONG_INLINE Packet2cd psqrt(const Packet2cd& a) { + return psqrt_complex(a); } -template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex b) -{ - return 
Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,(1<<7)|(1<<6))); -} - -template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex b) -{ - return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,(1<<3)|(1<<2))); +template<> EIGEN_STRONG_INLINE Packet4cf psqrt(const Packet4cf& a) { + return psqrt_complex(a); } } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h index 6af67ce2..67041c81 100644 --- a/externals/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin, cos, exp, and log functions of this file are loosely derived from +/* The sin and cos functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -18,187 +18,50 @@ namespace Eigen { namespace internal { -inline Packet8i pshiftleft(Packet8i v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(v, n); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +psin(const Packet8f& _x) { + return psin_float(_x); } -inline Packet8f pshiftright(Packet8f v, int n) -{ -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); - return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); -#endif +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 
Packet8f +pcos(const Packet8f& _x) { + return pcos_float(_x); } -// Sine function -// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and -// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants -// are (anti-)symmetric and thus have only odd/even coefficients template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psin(const Packet8f& _x) { - Packet8f x = _x; +plog(const Packet8f& _x) { + return plog_float(_x); +} - // Some useful values. - _EIGEN_DECLARE_CONST_Packet8i(one, 1); - _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); - _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); - _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); - _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); - - // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. - Packet8f z = pmul(x, p8f_one_over_pi); - Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); - x = pmadd(shift, p8f_neg_pi_first, x); - x = pmadd(shift, p8f_neg_pi_second, x); - x = pmadd(shift, p8f_neg_pi_third, x); - z = pmul(x, p8f_four_over_pi); - - // Make a mask for the entries that need flipping, i.e. wherever the shift - // is odd. - Packet8i shift_ints = _mm256_cvtps_epi32(shift); - Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); - Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); - - // Create a mask for which interpolant to use, i.e. if z > 1, then the mask - // is set to ones for that entry. - Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); - - // Evaluate the polynomial for the interval [1,3] in z. 
- _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); - Packet8f z_minus_two = psub(z, p8f_two); - Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); - Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); - right = pmadd(right, z_minus_two2, p8f_coeff_right_2); - right = pmadd(right, z_minus_two2, p8f_coeff_right_0); - - // Evaluate the polynomial for the interval [-1,1] in z. - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); - _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); - Packet8f z2 = pmul(z, z); - Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); - left = pmadd(left, z2, p8f_coeff_left_3); - left = pmadd(left, z2, p8f_coeff_left_1); - left = pmul(left, z); - - // Assemble the results, i.e. select the left and right polynomials. - left = _mm256_andnot_ps(ival_mask, left); - right = _mm256_and_ps(ival_mask, right); - Packet8f res = _mm256_or_ps(left, right); - - // Flip the sign on the odd intervals and return the result. - res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); - return res; +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +plog(const Packet4d& _x) { + return plog_double(_x); } -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... 
-> profit! template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -plog(const Packet8f& _x) { - Packet8f x = _x; - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); - - // Polynomial coefficients. - _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); - - Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p8f_min_norm_pos); - - Packet8f emm0 = pshiftright(x,23); - Packet8f e = _mm256_sub_ps(emm0, p8f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm256_and_ps(x, p8f_inv_mant_mask); - x = _mm256_or_ps(x, p8f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. 
The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); - Packet8f tmp = _mm256_and_ps(x, mask); - x = psub(x, p8f_1); - e = psub(e, _mm256_and_ps(p8f_1, mask)); - x = padd(x, tmp); - - Packet8f x2 = pmul(x, x); - Packet8f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet8f y, y1, y2; - y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); - y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); - y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); - y = pmadd(y, x, p8f_cephes_log_p2); - y1 = pmadd(y1, x, p8f_cephes_log_p5); - y2 = pmadd(y2, x, p8f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p8f_cephes_log_q1); - tmp = pmul(x2, p8f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p8f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm256_or_ps( - _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), - _mm256_and_ps(iszero_mask, p8f_minus_inf)); +plog2(const Packet8f& _x) { + return plog2_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +plog2(const Packet4d& _x) { + return plog2_double(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f plog1p(const Packet8f& _x) { + return generic_plog1p(_x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f pexpm1(const Packet8f& _x) { + return generic_expm1(_x); } // Exponential function. 
Works by writing "x = m*log(2) + r" where @@ -207,149 +70,21 @@ plog(const Packet8f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp(const Packet8f& _x) { - _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); - - _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); - - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); - - // Clamp x. - Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); - -// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is -// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating -// truncation errors. Note that we don't use the "pmadd" function here to -// ensure that a precision-preserving FMA instruction is used. -#ifdef EIGEN_VECTORIZE_FMA - _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); - Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); -#else - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); - Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); - r = psub(r, pmul(m, p8f_cephes_exp_C2)); -#endif - - Packet8f r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. 
- Packet8f y = p8f_cephes_exp_p0; - y = pmadd(y, r, p8f_cephes_exp_p1); - y = pmadd(y, r, p8f_cephes_exp_p2); - y = pmadd(y, r, p8f_cephes_exp_p3); - y = pmadd(y, r, p8f_cephes_exp_p4); - y = pmadd(y, r, p8f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p8f_1); - - // Build emm0 = 2^m. - Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); - emm0 = pshiftleft(emm0, 23); - - // Return 2^m * exp(r). - return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); + return pexp_float(_x); } // Hyperbolic Tangent function. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -ptanh(const Packet8f& x) { - return internal::generic_fast_tanh_float(x); +ptanh(const Packet8f& _x) { + return internal::generic_fast_tanh_float(_x); } +// Exponential function for doubles. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d pexp(const Packet4d& _x) { - Packet4d x = _x; - - _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); - _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - - Packet4d tmp, fx; - - // clamp 
x - x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); - // Express exp(x) as exp(g + n*log(2)). - fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); - - // Get the integer modulus of log(2), i.e. the "n" described above. - fx = _mm256_floor_pd(fx); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - tmp = pmul(fx, p4d_cephes_exp_C1); - Packet4d z = pmul(fx, p4d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet4d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet4d px = p4d_cephes_exp_p0; - px = pmadd(px, x2, p4d_cephes_exp_p1); - px = pmadd(px, x2, p4d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet4d qx = p4d_cephes_exp_q0; - qx = pmadd(qx, x2, p4d_cephes_exp_q1); - qx = pmadd(qx, x2, p4d_cephes_exp_q2); - qx = pmadd(qx, x2, p4d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm256_div_pd(px, psub(qx, px)); - x = pmadd(p4d_2, x, p4d_1); - - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - __m128i emm0 = _mm256_cvtpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); + return pexp_double(_x); } // Functions for sqrt. 
@@ -362,37 +97,39 @@ pexp(const Packet4d& _x) { // For detail see here: http://www.beyond3d.com/content/articles/8/ #if EIGEN_FAST_MATH template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -psqrt(const Packet8f& _x) { - Packet8f half = pmul(_x, pset1(.5f)); - Packet8f denormal_mask = _mm256_and_ps( - _mm256_cmp_ps(_x, pset1((std::numeric_limits::min)()), - _CMP_LT_OQ), - _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ)); +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt(const Packet8f& _x) { + Packet8f minus_half_x = pmul(_x, pset1(-0.5f)); + Packet8f denormal_mask = pandnot( + pcmp_lt(_x, pset1((std::numeric_limits::min)())), + pcmp_lt(_x, pzero(_x))); // Compute approximate reciprocal sqrt. Packet8f x = _mm256_rsqrt_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); // Flush results for denormals to zero. 
- return _mm256_andnot_ps(denormal_mask, pmul(_x,x)); + return pandnot(pmul(_x,x), denormal_mask); } + #else + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f psqrt(const Packet8f& x) { - return _mm256_sqrt_ps(x); +Packet8f psqrt(const Packet8f& _x) { + return _mm256_sqrt_ps(_x); } + #endif + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d psqrt(const Packet4d& x) { - return _mm256_sqrt_pd(x); +Packet4d psqrt(const Packet4d& _x) { + return _mm256_sqrt_pd(_x); } -#if EIGEN_FAST_MATH +#if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); @@ -401,36 +138,88 @@ Packet8f prsqrt(const Packet8f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). - Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); - Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); - Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), - _mm256_and_ps(zero_mask, p8f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); - - // Insert NaNs and Infs in all the right places. 
- return _mm256_or_ps(x, infs_and_nans); + Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ); + Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask); + + // Compute an approximate result using the rsqrt intrinsic. + Packet8f y_approx = _mm256_rsqrt_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). 
+ return pselect(not_normal_finite_mask, y_approx, y_newton); } #else template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f prsqrt(const Packet8f& x) { +Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); - return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); + return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x)); } #endif template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4d prsqrt(const Packet4d& x) { +Packet4d prsqrt(const Packet4d& _x) { _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); - return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); + return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x)); } +F16_PACKET_FUNCTION(Packet8f, Packet8h, psin) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2) +F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1) +F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp) +F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh) +F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt) +F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt) + +template <> +EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) { + Packet8f fexponent; + const Packet8h out = float2half(pfrexp(half2float(a), fexponent)); + exponent = float2half(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) { + return float2half(pldexp(half2float(a), half2float(exponent))); +} + +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh) +BF16_PACKET_FUNCTION(Packet8f, 
Packet8bf, psqrt) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt) + +template <> +EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) { + Packet8f fexponent; + const Packet8bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); +} } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/AVX/PacketMath.h index 195d40fb..7fc32fd7 100644 --- a/externals/eigen/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/externals/eigen/Eigen/src/Core/arch/AVX/PacketMath.h @@ -18,11 +18,11 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -31,10 +31,14 @@ namespace internal { typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; +typedef eigen_packet_wrapper<__m128i, 2> Packet8h; +typedef eigen_packet_wrapper<__m128i, 3> Packet8bf; template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ const Packet8f p8f_##NAME = pset1(X) @@ -58,21 +62,28 @@ template<> struct packet_traits : default_packet_traits enum { Vectorizable 
= 1, AlignedOnScalar = 1, - size=8, + size = 8, HasHalfPacket = 1, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = 0, - HasLog = 1, - HasExp = 1, + HasCmp = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasNdtri = 1, + HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasRint = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -85,14 +96,104 @@ template<> struct packet_traits : default_packet_traits size=4, HasHalfPacket = 1, + HasCmp = 1, HasDiv = 1, + HasLog = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1 + HasCeil = 1, + HasRint = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8bf type; + // There is no half-size packet for current Packet8bf. + // TODO: support as SSE path. 
+ typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 }; }; #endif @@ -113,14 +214,45 @@ template<> struct packet_traits : default_packet_traits }; */ -template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet8i integer_packet; + typedef uint8_t mask_t; + enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; }; +template<> struct unpacket_traits { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; + +// Helper function for bit packing snippet of low precision comparison. 
+// It packs the flags from 16x16 to 8x16. +EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) { + return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), + _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); +} + template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int from) { return _mm256_castsi256_ps(pset1(from)); } +template<> EIGEN_STRONG_INLINE Packet4d pset1frombits(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); } + + +template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); } +template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); } +template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); } + template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -129,9 +261,27 @@ template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { retur template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d 
padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi32(a,b); +#else + __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi32(a,b); +#else + __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { @@ -148,7 +298,15 @@ template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8f pmul(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pmul(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } - +template<> EIGEN_STRONG_INLINE Packet8i pmul(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_mullo_epi32(a,b); +#else + const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} 
template<> EIGEN_STRONG_INLINE Packet8f pdiv(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pdiv(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } @@ -157,13 +315,14 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) - // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, - // and gcc stupidly generates a vfmadd132ps instruction, - // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate - // the result of the product. +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) + // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and even register spilling with clang>=6.0 (bug 1637). + // Gcc stupidly generates a vfmadd132ps instruction. + // So let's enforce it to generate a vfmadd231ps instruction since the most common use + // case is to accumulate the result of the product. 
Packet8f res = c; __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); return res; @@ -172,7 +331,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& #endif } template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // see above Packet4d res = c; __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); @@ -183,14 +342,112 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& } #endif -template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } + +template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); } +template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); 
} + + +template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpeq_epi32(a,b); +#else + __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may flip + // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet8f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_ps(b,a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_pd(b,a); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet8f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::max. 
+ return _mm256_max_ps(b,a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // See pmin above + Packet4d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + // Arguments are swapped to match NaN propagation behavior of std::max. + return _mm256_max_pd(b,a); +#endif +} -template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { + return pminmax_propagate_nan(a, b, pmax); +} -template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) 
{ return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet8f print(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4d print(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } @@ -198,17 +455,124 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8i ptrue(const Packet8i& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + return _mm256_cmpeq_epi32(a,a); +#else + const __m256 b = _mm256_castsi256_ps(a); + return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqd has lower latency than the more general vcmpps + const __m256i b = _mm256_castps_si256(a); + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b)); +#else + return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4d ptrue(const Packet4d& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + // vpcmpeqq has lower latency than the more general vcmppd + const __m256i b = _mm256_castpd_si256(a); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b)); +#else + return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ); +#endif +} + template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE 
Packet8i pand(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_and_si256(a,b); +#else + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_or_si256(a,b); +#else + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(a,b); +#else + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); +#endif +} -template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_andnot_si256(b,a); +#else + return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8f pround(const 
Packet8f& a) +{ + const Packet8f mask = pset1frombits(static_cast(0x80000000u)); + const Packet8f prev0dot5 = pset1frombits(static_cast(0x3EFFFFFFu)); + return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} +template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) +{ + const Packet4d mask = pset1frombits(static_cast(0x8000000000000000ull)); + const Packet4d prev0dot5 = pset1frombits(static_cast(0x3FDFFFFFFFFFFFFFull)); + return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) +{ return _mm256_blendv_ps(b,a,mask); } +template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) +{ return _mm256_blendv_pd(b,a,mask); } + +template EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srai_epi32(a, N); +#else + __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srli_epi32(a, N); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +template EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(a, N); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pload(const float* 
from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } @@ -218,6 +582,14 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EI template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { + Packet8i mask = _mm256_set1_epi8(static_cast(umask)); + const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = por(mask, bit_mask); + mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); +} + // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { @@ -225,7 +597,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); // return _mm256_unpacklo_ps(tmp,tmp); - + // _mm256_insertf128_ps is very slow on Haswell, thus: Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); // mimic an "inplace" permutation of the lower 128bits using a blend @@ -255,6 +627,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE 
_mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { + Packet8i mask = _mm256_set1_epi8(static_cast(umask)); + const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + mask = por(mask, bit_mask); + mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); + EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +} + // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, Index stride) @@ -308,9 +688,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) } #ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { @@ -333,9 +713,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) { __m256d tmp = _mm256_shuffle_pd(a,a,5); return _mm256_permute2f128_pd(tmp, tmp, 1); - + #if 0 + // This version is unlikely to be faster as _mm256_shuffle_ps and 
_mm256_permute_pd + // exhibit the same latency/throughput, but it is here for future reference/benchmarking... __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); return _mm256_permute_pd(swap_halves,5); + #endif } // pabs should be ok @@ -350,47 +733,66 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } -// preduxp should be ok -// FIXME: why is this ok? why isn't the simply implementation working as expected? -template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) -{ - __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); - __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); - __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); - __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); +template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { + return pfrexp_generic(a,exponent); +} - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); +// Extract exponent without existence of Packet4l. 
+template<> +EIGEN_STRONG_INLINE +Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) { + const Packet4d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask)); +#ifdef EIGEN_VECTORIZE_AVX2 + a_expo = _mm256_srli_epi64(a_expo, 52); + __m128i lo = _mm256_extractf128_si256(a_expo, 0); + __m128i hi = _mm256_extractf128_si256(a_expo, 1); +#else + __m128i lo = _mm256_extractf128_si256(a_expo, 0); + __m128i hi = _mm256_extractf128_si256(a_expo, 1); + lo = _mm_srli_epi64(lo, 52); + hi = _mm_srli_epi64(hi, 52); +#endif + Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3)); + Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3)); + Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0); + exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1); + return exponent; +} - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - return final; +template<> EIGEN_STRONG_INLINE Packet4d pfrexp(const Packet4d& a, Packet4d& exponent) { + return pfrexp_generic(a, exponent); } -template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) -{ - Packet4d tmp0, tmp1; - tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); +template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, const Packet8f& exponent) { + return pldexp_generic(a, exponent); +} - return _mm256_blend_pd(tmp0, tmp1, 0xC); +template<> EIGEN_STRONG_INLINE Packet4d pldexp(const Packet4d& a, const Packet4d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet4d max_exponent = pset1(2099.0); + const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), 
max_exponent)); + + // Split 2^e into four factors and multiply. + const Packet4i bias = pset1(1023); + Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + Packet4i lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; } template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) @@ -402,7 +804,7 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } -template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) +template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } @@ -446,93 +848,16 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) +// { +// return _mm256_movemask_ps(x)==0xFF; +// } -template -struct palign_impl +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) { - static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) - { - if (Offset==1) - { - first = _mm256_blend_ps(first, second, 1); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 
0x88); - } - else if (Offset==2) - { - first = _mm256_blend_ps(first, second, 3); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xcc); - } - else if (Offset==3) - { - first = _mm256_blend_ps(first, second, 7); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_blend_ps(tmp1, tmp2, 0xee); - } - else if (Offset==4) - { - first = _mm256_blend_ps(first, second, 15); - Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); - Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); - first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); - } - else if (Offset==5) - { - first = _mm256_blend_ps(first, second, 31); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0x88); - } - else if (Offset==6) - { - first = _mm256_blend_ps(first, second, 63); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xcc); - } - else if (Offset==7) - { - first = _mm256_blend_ps(first, second, 127); - first = _mm256_permute2f128_ps(first, first, 1); - Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); - first = _mm256_permute2f128_ps(tmp, tmp, 1); - first = _mm256_blend_ps(tmp, first, 0xee); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) - { - if (Offset==1) - { - first = _mm256_blend_pd(first, second, 1); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 0xA); - } - else if (Offset==2) - { - first = 
_mm256_blend_pd(first, second, 3); - first = _mm256_permute2f128_pd(first, first, 1); - } - else if (Offset==3) - { - first = _mm256_blend_pd(first, second, 7); - __m256d tmp = _mm256_permute_pd(first, 5); - first = _mm256_permute2f128_pd(tmp, tmp, 1); - first = _mm256_blend_pd(tmp, first, 5); - } - } -}; + return _mm256_movemask_ps(x)!=0; +} EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -606,24 +931,640 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); } -template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) +// Packet math for Eigen::half + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + return _mm_set1_epi16(numext::bit_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return numext::bit_cast(static_cast(_mm_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + return _mm_load_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + return _mm_loadu_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + const numext::uint16_t c = numext::bit_cast(from[2]); + const numext::uint16_t d 
= numext::bit_cast(from[3]); + return _mm_set_epi16(d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + return _mm_set_epi16(b, b, b, b, a, a, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { + return _mm_cmpeq_epi32(a, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) { + const __m128i sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_andnot_si128(sign_mask, a); +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + const numext::uint16_t s0 = numext::bit_cast(Eigen::half(aux[0])); + const numext::uint16_t s1 = numext::bit_cast(Eigen::half(aux[1])); + const numext::uint16_t s2 = numext::bit_cast(Eigen::half(aux[2])); + const numext::uint16_t s3 = numext::bit_cast(Eigen::half(aux[3])); + const numext::uint16_t s4 = numext::bit_cast(Eigen::half(aux[4])); + const numext::uint16_t s5 = numext::bit_cast(Eigen::half(aux[5])); + const numext::uint16_t s6 = numext::bit_cast(Eigen::half(aux[6])); + const numext::uint16_t s7 = numext::bit_cast(Eigen::half(aux[7])); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmin(const Packet8h& a, + const Packet8h& b) { + return float2half(pmin(half2float(a), 
half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h pmax(const Packet8h& a, + const Packet8h& b) { + return float2half(pmax(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8h plset(const half& a) { + return float2half(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { + // in some cases Packet4i is a wrapper around __m128i, so we either need to + // cast to Packet4i to directly call the intrinsics as below: + return _mm_or_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { + return _mm_xor_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { + return _mm_and_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { + return _mm_andnot_si128(b,a); +} + +template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { + return _mm_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8h pround(const Packet8h& a) { + return float2half(pround(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h print(const Packet8h& a) { + return float2half(print(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pceil(const Packet8h& a) { + return float2half(pceil(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pfloor(const Packet8h& a) { + return float2half(pfloor(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_eq(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_le(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_lt(half2float(a), half2float(b))); +} + +template<> 
EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) { + return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + Packet8h sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_xor_si128(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) { - return _mm256_blend_ps(a,pset1(b),1); + const numext::uint16_t s0 = numext::bit_cast(from[0*stride]); + const numext::uint16_t s1 = numext::bit_cast(from[1*stride]); + const numext::uint16_t s2 = numext::bit_cast(from[2*stride]); + const numext::uint16_t s3 = numext::bit_cast(from[3*stride]); + const numext::uint16_t s4 = numext::bit_cast(from[4*stride]); + const numext::uint16_t s5 = numext::bit_cast(from[5*stride]); + const numext::uint16_t s6 = numext::bit_cast(from[6*stride]); + const numext::uint16_t s7 = numext::bit_cast(from[7*stride]); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); } -template<> EIGEN_STRONG_INLINE Packet4d 
pinsertfirst(const Packet4d& a, double b) +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) { - return _mm256_blend_pd(a,pset1(b),1); + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); } -template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) { - return _mm256_blend_ps(a,pset1(b),(1<<7)); + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm_shuffle_epi8(a,m); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + __m128i e = kernel.packet[4]; + __m128i f = kernel.packet[5]; + __m128i g = kernel.packet[6]; + __m128i h = kernel.packet[7]; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i 
c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0] = a0b0c0d0e0f0g0h0; + kernel.packet[1] = a1b1c1d1e1f1g1h1; + kernel.packet[2] = a2b2c2d2e2f2g2h2; + kernel.packet[3] = a3b3c3d3e3f3g3h3; + kernel.packet[4] = a4b4c4d4e4f4g4h4; + kernel.packet[5] = a5b5c5d5e5f5g5h5; + kernel.packet[6] = a6b6c6d6e6f6g6h6; + kernel.packet[7] = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + 
+ kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + +// BFloat16 implementation. + +EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { +#ifdef EIGEN_VECTORIZE_AVX2 + __m256i extend = _mm256_cvtepu16_epi32(a); + return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16)); +#else + __m128i lo = _mm_cvtepu16_epi32(a); + __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8)); + __m128i lo_shift = _mm_slli_epi32(lo, 16); + __m128i hi_shift = _mm_slli_epi32(hi, 16); + return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1)); +#endif } -template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm. +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { + Packet8bf r; + + __m256i input = _mm256_castps_si256(a); + +#ifdef EIGEN_VECTORIZE_AVX2 + // uint32_t lsb = (input >> 16); + __m256i t = _mm256_srli_epi32(input, 16); + // uint32_t lsb = lsb & 1; + t = _mm256_and_si256(t, _mm256_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff)); + // input += rounding_bias; + t = _mm256_add_epi32(t, input); + // input = input >> 16; + t = _mm256_srli_epi32(t, 16); + // Check NaN before converting back to bf16 + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m256i nan = _mm256_set1_epi32(0x7fc0); + t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask)); + // output = numext::bit_cast(input); + return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), + _mm256_extractf128_si256(t, 1)); +#else + // uint32_t lsb = (input >> 16); + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16); + // uint32_t lsb = lsb & 1; + lo = _mm_and_si128(lo, _mm_set1_epi32(1)); + hi = _mm_and_si128(hi, 
_mm_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff)); + hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff)); + // input += rounding_bias; + lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0)); + hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1)); + // input = input >> 16; + lo = _mm_srli_epi32(lo, 16); + hi = _mm_srli_epi32(hi, 16); + // Check NaN before converting back to bf16 + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); + __m128i nan = _mm_set1_epi32(0x7fc0); + lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask))); + hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1))); + // output = numext::bit_cast(input); + return _mm_packus_epi32(lo, hi); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8bf pset1(const bfloat16& from) { + return _mm_set1_epi16(numext::bit_cast(from)); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& from) { + return numext::bit_cast(static_cast(_mm_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* from) { + return _mm_load_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploadu(const bfloat16* from) { + return _mm_loadu_si128(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet8bf& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet8bf& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet8bf +ploaddup(const bfloat16* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + const numext::uint16_t c = numext::bit_cast(from[2]); + const numext::uint16_t d = numext::bit_cast(from[3]); + return _mm_set_epi16(d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE 
Packet8bf +ploadquad(const bfloat16* from) { + const numext::uint16_t a = numext::bit_cast(from[0]); + const numext::uint16_t b = numext::bit_cast(from[1]); + return _mm_set_epi16(b, b, b, b, a, a, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) { + return _mm_cmpeq_epi32(a, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { + const __m128i sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_andnot_si128(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmin(const Packet8bf& a, + const Packet8bf& b) { + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, + const Packet8bf& b) { + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf plset(const bfloat16& a) { + return F32ToBf16(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) { + return _mm_or_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) { + return _mm_xor_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) { + return _mm_and_si128(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) { + return _mm_andnot_si128(b,a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) { + return _mm_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pround(const Packet8bf& a) { - return _mm256_blend_pd(a,pset1(b),(1<<3)); + return F32ToBf16(pround(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf print(const Packet8bf& a) { + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pceil(const Packet8bf& a) { + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE 
Packet8bf pfloor(const Packet8bf& a) { + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) { + return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) { + Packet8bf sign_mask = _mm_set1_epi16(static_cast(0x8000)); + return _mm_xor_si128(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf padd(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmul(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pdiv(const Packet8bf& a, const Packet8bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + + +template<> EIGEN_STRONG_INLINE Packet8bf pgather(const bfloat16* from, Index stride) +{ + const numext::uint16_t s0 = numext::bit_cast(from[0*stride]); + const numext::uint16_t s1 = numext::bit_cast(from[1*stride]); + const numext::uint16_t s2 = numext::bit_cast(from[2*stride]); + const numext::uint16_t s3 = numext::bit_cast(from[3*stride]); + const numext::uint16_t s4 = numext::bit_cast(from[4*stride]); + const numext::uint16_t 
s5 = numext::bit_cast(from[5*stride]); + const numext::uint16_t s6 = numext::bit_cast(from[6*stride]); + const numext::uint16_t s7 = numext::bit_cast(from[7*stride]); + return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); +} + +template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +{ + EIGEN_ALIGN32 bfloat16 aux[8]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) { + return static_cast(predux(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) { + return static_cast(predux_max(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) { + return static_cast(predux_min(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) { + return static_cast(predux_mul(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm_shuffle_epi8(a,m); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + __m128i e = kernel.packet[4]; + __m128i f = kernel.packet[5]; + __m128i g = kernel.packet[6]; + __m128i h = kernel.packet[7]; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, 
c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0]; + __m128i b = kernel.packet[1]; + __m128i c = kernel.packet[2]; + __m128i d = kernel.packet[3]; + + __m128i ab_03 = _mm_unpacklo_epi16(a, b); + __m128i cd_03 = _mm_unpacklo_epi16(c, d); + __m128i ab_47 = _mm_unpackhi_epi16(a, b); + __m128i cd_47 = _mm_unpackhi_epi16(c, d); + + kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03); + kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03); + kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47); + kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47); } } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h index 83bfdc60..d507fb67 100644 --- a/externals/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/externals/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -35,15 +35,79 @@ struct type_casting_traits { }; +#ifndef EIGEN_VECTORIZE_AVX512 + +template <> +struct 
type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +#endif // EIGEN_VECTORIZE_AVX512 template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { - return _mm256_cvtps_epi32(a); + return _mm256_cvttps_epi32(a); } template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { return _mm256_cvtepi32_ps(a); } +template<> EIGEN_STRONG_INLINE Packet8i preinterpret(const Packet8f& a) { + return _mm256_castps_si256(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Packet8i& a) { + return _mm256_castsi256_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8bf& a) { + return Bf16ToF32(a); +} + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8f& a) { + return F32ToBf16(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/arch/AVX512/Complex.h b/externals/eigen/Eigen/src/Core/arch/AVX512/Complex.h new file mode 100644 index 00000000..49c72b3f --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/AVX512/Complex.h @@ -0,0 +1,422 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_AVX512_H +#define EIGEN_COMPLEX_AVX512_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet8cf +{ + EIGEN_STRONG_INLINE Packet8cf() {} + EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {} + __m512 v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet8cf type; + typedef Packet4cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet4cf half; + typedef Packet16f as_real; + enum { + size = 8, + alignment=unpacket_traits::alignment, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) +{ + return Packet8cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) +{ + const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, + 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet8cf(pxor(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) +{ + __m512 
tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } + +template <> +EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { + __m512 eq = pcmp_eq(a.v, b.v); + return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) +{ + return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploaddup((const double*)(const void*)from )) ); +} +template<> EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) +{ + return Packet8cf( _mm512_castpd_ps( ploadquad((const double*)(const void*)from )) ); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), 
from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather, Packet8cf>(const std::complex* from, Index stride) +{ + return Packet8cf(_mm512_castpd_ps(pgather((const double*)(const void*)from, stride))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet8cf>(std::complex* to, const Packet8cf& from, Index stride) +{ + pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet8cf& a) +{ + return pfirst(Packet2cf(_mm512_castps512_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { + return Packet8cf(_mm512_castsi512_ps( + _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), + _mm512_castps_si512(a.v)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) +{ + return predux(padd(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) +{ + return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), + Packet4cf(extract256<1>(a.v)))); +} + +template <> +EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) { + __m256 lane0 = extract256<0>(a.v); + __m256 lane1 = extract256<1>(a.v); + __m256 res = _mm256_add_ps(lane0, lane1); + return Packet4cf(res); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) + +template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) +{ + Packet8cf num = pmul(a, pconj(b)); + __m512 tmp = _mm512_mul_ps(b.v, b.v); + __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); + __m512 denom = _mm512_add_ps(tmp, tmp2); + return Packet8cf(_mm512_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) +{ + return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet4cd +{ + EIGEN_STRONG_INLINE Packet4cd() {} + EIGEN_STRONG_INLINE 
explicit Packet4cd(const __m512d& a) : v(a) {} + __m512d v; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet4cd type; + typedef Packet2cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cd half; + typedef Packet8d as_real; + enum { + size = 4, + alignment = unpacket_traits::alignment, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4cd padd(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd psub(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) +{ + const __m512d mask = _mm512_castsi512_pd( + _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0, + 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet4cd(pxor(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) +{ + __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0); + __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF); + __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55); + __m512d odd = _mm512_mul_pd(tmp2, tmp3); + return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd por (const 
Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } + +template <> +EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { + __m512d eq = pcmp_eq(a.v, b.v); + return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) +{ + #ifdef EIGEN_VECTORIZE_AVX512DQ + return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); + #else + return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); + #endif +} + +template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { + return Packet4cd(_mm512_insertf64x4( + _mm512_castpd256_pd512(ploaddup(from).v), ploaddup(from+1).v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather, Packet4cd>(const std::complex* from, Index stride) +{ + return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+0*stride).v), ploadu(from+1*stride).v,1)), + _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+2*stride).v), ploadu(from+3*stride).v,1), 1)); +} + +template<> 
EIGEN_DEVICE_FUNC inline void pscatter, Packet4cd>(std::complex* to, const Packet4cd& from, Index stride) +{ + __m512i fromi = _mm512_castpd_si512(from.v); + double* tod = (double*)(void*)to; + _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); + _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); + _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); + _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); +} + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) +{ + __m128d low = extract128<0>(a.v); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex(res[0],res[1]); +} + +template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { + return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) +{ + return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) +{ + return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), + Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); +} + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cd pmadd(const 
Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d) + +template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b) +{ + Packet4cd num = pmul(a, pconj(b)); + __m512d tmp = _mm512_mul_pd(b.v, b.v); + __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp); + return Packet4cd(_mm512_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x) +{ + return Packet4cd(_mm512_permute_pd(x.v,0x55)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock pb; + + pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); + pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); + pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); + pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); + pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v); + pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v); + pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v); + pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v); + ptranspose(pb); + kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); + kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); + kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); + kernel.packet[3].v = 
_mm512_castpd_ps(pb.packet[3]); + kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]); + kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]); + kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]); + kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1] + __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3] + __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1] + __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3] + + kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3] + kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2] + kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1] + kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] +} + +template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { + return psqrt_complex(a); +} + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX512_H diff --git a/externals/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h index 399be0ee..6fd726d2 100644 --- a/externals/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -15,13 +15,13 @@ namespace Eigen { namespace internal { // Disable the code for older versions of gcc that don't support many of the 
required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ - const Packet16f p16f_##NAME = (__m512)pset1(X) + const Packet16f p16f_##NAME = preinterpret(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ const Packet8d p8d_##NAME = pset1(X) @@ -29,100 +29,41 @@ namespace internal { #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -#if defined(EIGEN_VECTORIZE_AVX512DQ) +#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \ + const Packet16bf p16bf_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \ + const Packet16bf p16bf_##NAME = preinterpret(pset1(X)) + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f plog(const Packet16f& _x) { - Packet16f x = _x; - _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f); - - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000); - - // The smallest non denormalized float number. - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); - - // Polynomial coefficients. 
- _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); - - // invalid_mask is set to true when x is NaN - __mmask16 invalid_mask = - _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); - __mmask16 iszero_mask = - _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, p16f_min_norm_pos); - - // Extract the shifted exponents. - Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23)); - Packet16f e = _mm512_sub_ps(emm0, p16f_126f); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm512_and_ps(x, p16f_inv_mant_mask); - x = _mm512_or_ps(x, p16f_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. 
- // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); - x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); - x = padd(x, tmp); - - Packet16f x2 = pmul(x, x); - Packet16f x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet16f y, y1, y2; - y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1); - y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4); - y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7); - y = pmadd(y, x, p16f_cephes_log_p2); - y1 = pmadd(y1, x, p16f_cephes_log_p5); - y2 = pmadd(y2, x, p16f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p16f_cephes_log_q1); - tmp = pmul(x2, p16f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p16f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. 
- return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, - _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); + return plog_float(_x); } -#endif + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +plog(const Packet8d& _x) { + return plog_double(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, plog) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog) + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +plog2(const Packet16f& _x) { + return plog2_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +plog2(const Packet8d& _x) { + return plog2_double(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2) // Exponential function. Works by writing "x = m*log(2) + r" where // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then @@ -158,17 +99,17 @@ pexp(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f); Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x); Packet16f r2 = pmul(r, r); + Packet16f r3 = pmul(r2, r); - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet16f y = p16f_cephes_exp_p0; - y = pmadd(y, r, p16f_cephes_exp_p1); - y = pmadd(y, r, p16f_cephes_exp_p2); - y = pmadd(y, r, p16f_cephes_exp_p3); - y = pmadd(y, r, p16f_cephes_exp_p4); - y = pmadd(y, r, p16f_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, p16f_1); + // Evaluate the polynomial approximant,improved by instruction-level parallelism. + Packet16f y, y1, y2; + y = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1); + y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4); + y2 = padd(r, p16f_1); + y = pmadd(y, r, p16f_cephes_exp_p2); + y1 = pmadd(y1, r, p16f_cephes_exp_p5); + y = pmadd(y, r3, y1); + y = pmadd(y, r2, y2); // Build emm0 = 2^m. 
Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127)); @@ -178,74 +119,40 @@ pexp(const Packet16f& _x) { return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); } -/*template <> +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d pexp(const Packet8d& _x) { - Packet8d x = _x; - - _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); - - _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); - - // clamp x - x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); - - // Express exp(x) as exp(g + n*log(2)). - const Packet8d n = - _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1); - const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2); - x = psub(x, nC1); - x = psub(x, nC2); - - const Packet8d x2 = pmul(x, x); + return pexp_double(_x); +} - // Evaluate the numerator polynomial of the rational interpolant. 
- Packet8d px = p8d_cephes_exp_p0; - px = pmadd(px, x2, p8d_cephes_exp_p1); - px = pmadd(px, x2, p8d_cephes_exp_p2); - px = pmul(px, x); +F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp) - // Evaluate the denominator polynomial of the rational interpolant. - Packet8d qx = p8d_cephes_exp_q0; - qx = pmadd(qx, x2, p8d_cephes_exp_q1); - qx = pmadd(qx, x2, p8d_cephes_exp_q2); - qx = pmadd(qx, x2, p8d_cephes_exp_q3); +template <> +EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) { + Packet16f fexponent; + const Packet16h out = float2half(pfrexp(half2float(a), fexponent)); + exponent = float2half(fexponent); + return out; +} - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm512_div_pd(px, psub(qx, px)); - x = pmadd(p8d_2, x, p8d_1); +template <> +EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) { + return float2half(pldexp(half2float(a), half2float(exponent))); +} - // Build e=2^n. - const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64( - _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52)); +template <> +EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) { + Packet16f fexponent; + const Packet16bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, e), _x); - }*/ +template <> +EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); +} // Functions for sqrt. 
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step @@ -257,138 +164,197 @@ pexp(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); - - Packet16f neg_half = pmul(_x, p16f_minus_half); + Packet16f neg_half = pmul(_x, pset1(-.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_ps_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), - _mm512_setzero_ps()); + Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + // Flush results for denormals to zero. 
+ return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); } template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); - - Packet8d neg_half = pmul(_x, p8d_minus_half); + Packet8d neg_half = pmul(_x, pset1(-.5)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_pd_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), - _mm512_setzero_pd()); + Packet8d x = _mm512_rsqrt14_pd(_x); - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5))); // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } #else template <> EIGEN_STRONG_INLINE Packet16f psqrt(const Packet16f& x) { return _mm512_sqrt_ps(x); } + template <> EIGEN_STRONG_INLINE Packet8d psqrt(const Packet8d& x) { return _mm512_sqrt_pd(x); } #endif -// Functions for rsqrt. -// Almost identical to the sqrt routine, just leave out the last multiplication -// and fill in NaN/Inf where needed. 
Note that this function only exists as an -// iterative version for doubles since there is no instruction for diretly -// computing the reciprocal square root in AVX-512. -#ifdef EIGEN_FAST_MATH +F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt) + +// prsqrt for float. +#if defined(EIGEN_VECTORIZE_AVX512ER) + +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + return _mm512_rsqrt28_ps(x); +} +#elif EIGEN_FAST_MATH + template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f prsqrt(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); Packet16f neg_half = pmul(_x, p16f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), - _mm512_rsqrt14_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. - __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); - Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, p16f_nan, - _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + // Identity infinite, negative and denormal arguments. 
+ __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ); + __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ); + __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask; + + // Compute an approximate result using the rsqrt intrinsic, forcing +inf + // for denormals for consistency with AVX and SSE implementations. + Packet16f y_approx = _mm512_rsqrt14_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive finite arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. + return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx); +} +#else - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f); + return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x)); } +#endif + +F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt) +// prsqrt for double. 
+#if EIGEN_FAST_MATH template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d prsqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); Packet8d neg_half = pmul(_x, p8d_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), - _mm512_rsqrt14_pd(_x)); + // Identity infinite, negative and denormal arguments. + __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ); + __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ); + __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask; - // Fill in NaNs and Infs for the negative/zero entries. - __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); - Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, p8d_nan, - _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); - - // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); + // Compute an approximate result using the rsqrt intrinsic, forcing +inf + // for denormals for consistency with AVX and SSE implementations. 
+#if defined(EIGEN_VECTORIZE_AVX512ER) + Packet8d y_approx = _mm512_rsqrt28_pd(_x); +#else + Packet8d y_approx = _mm512_rsqrt14_pd(_x); +#endif + // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the + // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available). + // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number + // of correct digits for each step. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five)); +#if !defined(EIGEN_VECTORIZE_AVX512ER) + y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five)); +#endif + // Select the result of the Newton-Raphson step for positive finite arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. 
+ return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx); } #else template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - return _mm512_rsqrt28_ps(x); +EIGEN_STRONG_INLINE Packet8d prsqrt(const Packet8d& x) { + _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f); + return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x)); } #endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f plog1p(const Packet16f& _x) { + return generic_plog1p(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p) + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16f pexpm1(const Packet16f& _x) { + return generic_expm1(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) + #endif + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +psin(const Packet16f& _x) { + return psin_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +pcos(const Packet16f& _x) { + return pcos_float(_x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +ptanh(const Packet16f& _x) { + return internal::generic_fast_tanh_float(_x); +} + +F16_PACKET_FUNCTION(Packet16f, Packet16h, psin) +F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos) +F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh) + +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh) + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h index f6500a16..34d49ab6 100644 --- a/externals/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/externals/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -19,10 +19,10 
@@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -31,6 +31,8 @@ namespace internal { typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; +typedef eigen_packet_wrapper<__m256i, 1> Packet16h; +typedef eigen_packet_wrapper<__m256i, 2> Packet16bf; template <> struct is_arithmetic<__m512> { @@ -45,6 +47,51 @@ struct is_arithmetic<__m512d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 1, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 0, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasBessel = 1, + HasNdtri = 1 + }; +}; + template<> struct packet_traits : default_packet_traits { typedef Packet16f type; @@ -54,15 +101,32 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 16, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) -#ifdef EIGEN_VECTORIZE_AVX512DQ + + HasAbs = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasBlend = 0, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) HasLog = 1, -#endif + HasLog1p = 1, 
+ HasExpm1 = 1, + HasNdtri = 1, + HasBessel = 1, HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, #endif - HasDiv = 1 + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -74,11 +138,18 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) - HasSqrt = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) + HasLog = 1, + HasExp = 1, + HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif - HasDiv = 1 + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1 }; }; @@ -98,19 +169,28 @@ template <> struct unpacket_traits { typedef float type; typedef Packet8f half; - enum { size = 16, alignment=Aligned64 }; + typedef Packet16i integer_packet; + typedef uint16_t mask_t; + enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64 }; + enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false }; +}; + +template<> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet8h half; + enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; }; template <> @@ -126,13 +206,40 @@ EIGEN_STRONG_INLINE Packet16i pset1(const int& from) { return _mm512_set1_epi32(from); } +template <> 
+EIGEN_STRONG_INLINE Packet16f pset1frombits(unsigned int from) { + return _mm512_castsi512_ps(_mm512_set1_epi32(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet8d pset1frombits(const numext::uint64_t from) { + return _mm512_castsi512_pd(_mm512_set1_epi64(from)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); } + +template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) { + return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1)); +} +template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) { + return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, + 0, -1, 0, -1, 0, -1, 0, -1); +} +template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) { + return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, + 0, 0, -1, -1, 0, 0, -1, -1)); +} + template <> EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { return _mm512_broadcastss_ps(_mm_load_ps1(from)); } template <> EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { - return _mm512_broadcastsd_pd(_mm_load_pd1(from)); + return _mm512_set1_pd(*from); } template <> @@ -158,6 +265,11 @@ EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, const Packet8d& b) { return _mm512_add_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i padd(const Packet16i& a, + const Packet16i& b) { + return _mm512_add_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, @@ -169,6 +281,11 @@ EIGEN_STRONG_INLINE Packet8d psub(const Packet8d& a, const Packet8d& b) { return _mm512_sub_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, + const Packet16i& b) { + return _mm512_sub_epi32(a, b); +} 
template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { @@ -202,6 +319,11 @@ EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, const Packet8d& b) { return _mm512_mul_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i pmul(const Packet16i& a, + const Packet16i& b) { + return _mm512_mullo_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, @@ -214,7 +336,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, return _mm512_div_pd(a, b); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) { @@ -227,52 +349,217 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, } #endif +template <> +EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, + const Packet16f& a, + const Packet16f& b) { + __mmask16 mask16 = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mask16, a, b); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, + const Packet8d& a, + const Packet8d& b) { + __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), + _mm512_setzero_epi32(), _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mask8, a, b); +} + template <> EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { - return _mm512_min_ps(a, b); + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_ps(b, a); } template <> EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { - return _mm512_min_pd(a, b); + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_pd(b, a); } template <> EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { - return _mm512_max_ps(a, b); + // Arguments are reversed to match NaN propagation behavior of std::max. 
+ return _mm512_max_ps(b, a); } template <> EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { - return _mm512_max_pd(a, b); + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm512_max_pd(b, a); } -template <> -EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, - const Packet16f& b) { +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { + return pminmax_propagate_nan(a, b, pmax); +} + + #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_and_ps(a, b); +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = 
_mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); +// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { + return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_)); +} - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); +// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { + return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_)); +} + +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { + return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), + _mm256_castps_si256(b),1)); +} +#endif + +// Helper function for bit packing snippet of low precision comparison. +// It packs the flags from 32x16 to 16x16. +EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) { + // Split data into small pieces and handle with AVX instructions + // to guarantee internal order of vector. + // Operation: + // dst[15:0] := Saturate16(rf[31:0]) + // dst[31:16] := Saturate16(rf[63:32]) + // ... 
+ // dst[255:240] := Saturate16(rf[255:224]) + __m256i lo = _mm256_castps_si256(extract256<0>(rf)); + __m256i hi = _mm256_castps_si256(extract256<1>(rf)); + __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), + _mm256_extractf128_si256(lo, 1)); + __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), + _mm256_extractf128_si256(hi, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} +template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); +} + +template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { + __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); + return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); +} - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = 
_mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) { + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); +} - return res; +template<> EIGEN_STRONG_INLINE Packet16f print(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet8d print(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); } + +template<> EIGEN_STRONG_INLINE Packet16f pceil(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); } +template<> EIGEN_STRONG_INLINE Packet8d pceil(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); } + +template<> EIGEN_STRONG_INLINE Packet16f pfloor(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); } +template<> EIGEN_STRONG_INLINE Packet8d pfloor(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); } + +template <> +EIGEN_STRONG_INLINE Packet16i ptrue(const Packet16i& /*a*/) { + return _mm512_set1_epi32(0xffffffffu); +} + 
+template <> +EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) { + return _mm512_castsi512_ps(ptrue(_mm512_castps_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8d ptrue(const Packet8d& a) { + return _mm512_castsi512_pd(ptrue(_mm512_castpd_si512(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, + const Packet16i& b) { + return _mm512_and_si512(a,b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_and_ps(a, b); +#else + return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> @@ -288,35 +575,21 @@ EIGEN_STRONG_INLINE Packet8d pand(const Packet8d& a, Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); #endif } + +template <> +EIGEN_STRONG_INLINE Packet16i por(const Packet16i& a, const Packet16i& b) { + return _mm512_or_si512(a, b); +} + template <> -EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = 
_mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } @@ -326,107 +599,80 @@ EIGEN_STRONG_INLINE Packet8d por(const Packet8d& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pxor(const Packet16i& a, const Packet16i& b) { + return _mm512_xor_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); - - return res; + return 
_mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } + template <> -EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pandnot(const Packet16i& a, const Packet16i& b) { + return _mm512_andnot_si512(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_ps(a, b); + return _mm512_andnot_ps(b, a); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3); - - return res; + return 
_mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a,const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_pd(a, b); + return _mm512_andnot_pd(b, a); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); + return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); +#endif +} - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); +template<> EIGEN_STRONG_INLINE Packet16f pround(const Packet16f& a) +{ + // Work-around for default std::round rounding mode. + const Packet16f mask = pset1frombits(static_cast(0x80000000u)); + const Packet16f prev0dot5 = pset1frombits(static_cast(0x3EFFFFFFu)); + return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} +template<> EIGEN_STRONG_INLINE Packet8d pround(const Packet8d& a) +{ + // Work-around for default std::round rounding mode. 
+ const Packet8d mask = pset1frombits(static_cast(0x8000000000000000ull)); + const Packet8d prev0dot5 = pset1frombits(static_cast(0x3FDFFFFFFFFFFFFFull)); + return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} - return res; -#endif +template EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) { + return _mm512_srai_epi32(a, N); +} + +template EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) { + return _mm512_srli_epi32(a, N); +} + +template EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) { + return _mm512_slli_epi32(a, N); } template <> @@ -457,79 +703,65 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { reinterpret_cast(from)); } +template <> +EIGEN_STRONG_INLINE Packet16f ploadu(const float* from, uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from); +} + // Loads 8 floats from memory a returns the packet // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { - Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from); - // mimic an "inplace" permutation of the lower 128bits using a blend - lane0 = _mm256_blend_ps( - lane0, _mm256_castps128_ps256(_mm_permute_ps( - _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))), - 15); - // then we can perform a consistent permutation on the global register to get - // everything in shape: - lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2)); - - Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4)); - // mimic an "inplace" permutation of the lower 128bits using a blend - lane1 = _mm256_blend_ps( - lane1, _mm256_castps128_ps256(_mm_permute_ps( - _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))), - 15); - // then we can perform a consistent permutation on the global register to get - // everything in shape: - lane1 
= _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2)); + // an unaligned load is required here as there is no requirement + // on the alignment of input pointer 'from' + __m256i low_half = _mm256_loadu_si256(reinterpret_cast(from)); + __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half)); + __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); + return pairs; +} #ifdef EIGEN_VECTORIZE_AVX512DQ - Packet16f res = _mm512_undefined_ps(); - return _mm512_insertf32x8(res, lane0, 0); - return _mm512_insertf32x8(res, lane1, 1); - return res; -#else - Packet16f res = _mm512_undefined_ps(); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3); - return res; -#endif -} +// FIXME: this does not look optimal, better load a Packet4d and shuffle... 
// Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // a3} template <> EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { - Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from); - lane0 = _mm256_permute_pd(lane0, 3 << 2); - - Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2)); - lane1 = _mm256_permute_pd(lane1, 3 << 2); - - Packet8d res = _mm512_undefined_pd(); - res = _mm512_insertf64x4(res, lane0, 0); - return _mm512_insertf64x4(res, lane1, 1); + __m512d x = _mm512_setzero_pd(); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); + return x; +} +#else +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { + __m512d x = _mm512_setzero_pd(); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3)); + return x; } +#endif // Loads 4 floats from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} template <> EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { - Packet16f tmp = _mm512_undefined_ps(); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3); - return tmp; + Packet16f tmp = _mm512_castps128_ps512(ploadu(from)); + const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0); + return _mm512_permutexvar_ps(scatter_mask, tmp); } + // Loads 2 doubles from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1} template <> 
EIGEN_STRONG_INLINE Packet8d ploadquad(const double* from) { - Packet8d tmp = _mm512_undefined_pd(); - Packet2d tmp0 = _mm_load_pd1(from); - Packet2d tmp1 = _mm_load_pd1(from + 1); - Packet4d lane0 = _mm256_broadcastsd_pd(tmp0); - Packet4d lane1 = _mm256_broadcastsd_pd(tmp1); + __m256d lane0 = _mm256_set1_pd(*from); + __m256d lane1 = _mm256_set1_pd(*(from+1)); + __m512d tmp = _mm512_undefined_pd(); tmp = _mm512_insertf64x4(tmp, lane0, 0); return _mm512_insertf64x4(tmp, lane1, 1); } @@ -561,11 +793,16 @@ EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( reinterpret_cast<__m512i*>(to), from); } +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet16f& from, uint16_t umask) { + __mmask16 mask = static_cast<__mmask16>(umask); + EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from); +} template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -575,7 +812,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, template <> EIGEN_DEVICE_FUNC inline Packet8d pgather(const double* from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); @@ -586,7 +823,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet16f& from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = 
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -596,7 +833,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet8d& from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); _mm512_i32scatter_pd(to, indices, from, 8); @@ -618,9 +855,9 @@ EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) { pstore(to, pa); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template <> EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { @@ -648,20 +885,73 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff))); } template <> EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return 
(__m512d)_mm512_and_si512((__m512i)a, - _mm512_set1_epi64(0x7fffffffffffffff)); + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), + _mm512_set1_epi64(0x7fffffffffffffff))); +} + +template<> +EIGEN_STRONG_INLINE Packet16f pfrexp(const Packet16f& a, Packet16f& exponent){ + return pfrexp_generic(a, exponent); +} + +// Extract exponent without existence of Packet8l. +template<> +EIGEN_STRONG_INLINE +Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) { + const Packet8d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + #ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)); + #else + return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52))); + #endif +} + +template<> +EIGEN_STRONG_INLINE Packet8d pfrexp(const Packet8d& a, Packet8d& exponent) { + return pfrexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet16f pldexp(const Packet16f& a, const Packet16f& exponent) { + return pldexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet8d max_exponent = pset1(2099.0); + const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply. 
+ const Packet8i bias = pset1(1023); + Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); + Packet8i lo = _mm256_slli_epi64(hi, 52); + hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); + Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); + Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); + lo = _mm256_slli_epi64(hi, 52); + hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); + c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; } #ifdef EIGEN_VECTORIZE_AVX512DQ // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ - __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \ - _mm512_extractf32x8_ps(INPUT, 1) + __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ + __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) #else #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ @@ -674,258 +964,64 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1); + OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); #else #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ + OUTPUT = _mm512_undefined_ps(); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ OUTPUT = _mm512_insertf32x4(OUTPUT, 
_mm256_extractf128_ps(INPUTB, 0), 2); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); #endif -template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* -vecs) -{ - EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); - EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); - EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2); - EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3); - EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4); - EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5); - EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6); - EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7); - EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8); - EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9); - EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10); - EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11); - EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12); - EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13); - EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14); - EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15); - - __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0); - __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0); - __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0); - __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0); - - __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); - __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); - __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); - __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - __m256 sum1 = _mm256_add_ps(perm1, hsum5); - __m256 sum2 = _mm256_add_ps(perm2, hsum6); - __m256 sum3 = _mm256_add_ps(perm3, hsum7); - __m256 sum4 = _mm256_add_ps(perm4, hsum8); - - __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); - - hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1); - hsum2 = _mm256_hadd_ps(vecs2_1, 
vecs3_1); - hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1); - hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0)); - - hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0); - hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0); - hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0); - hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0); - - hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1); - hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1); - hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1); - hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1); - - hsum5 = _mm256_hadd_ps(hsum1, hsum1); - hsum6 = _mm256_hadd_ps(hsum2, hsum2); - hsum7 = _mm256_hadd_ps(hsum3, hsum3); - hsum8 = _mm256_hadd_ps(hsum4, 
hsum4); - - perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); - perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); - perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); - perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); - - sum1 = _mm256_add_ps(perm1, hsum5); - sum2 = _mm256_add_ps(perm2, hsum6); - sum3 = _mm256_add_ps(perm3, hsum7); - sum4 = _mm256_add_ps(perm4, hsum8); - - blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); - blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); - - final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0)); - - __m512 final_output; - - EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1); - return final_output; -} - -template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) -{ - Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0); - Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1); - - Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0); - Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1); - - Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0); - Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1); - - Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0); - Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1); - - Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0); - Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1); - - Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0); - Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1); - - Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0); - Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1); - - Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0); - Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1); - - Packet4d tmp0, tmp1; - - tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC); - - tmp0 = _mm256_hadd_pd(vecs0_1, 
vecs1_1); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC)); - - tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC); - - tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1); - tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); - - tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1); - tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); - - final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); - - __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); - - return _mm512_insertf64x4(final_output, final_1, 1); -} template <> EIGEN_STRONG_INLINE float predux(const Packet16f& a) { - //#ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f sum = padd(lane0, lane1); - Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1)); - tmp0 = _mm256_hadd_ps(tmp0, tmp0); - return pfirst(_mm256_hadd_ps(tmp0, tmp0)); +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f x = _mm256_add_ps(lane0, lane1); + return predux(x); #else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3)); + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = 
_mm512_extractf32x4_ps(a, 3); + __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3)); sum = _mm_hadd_ps(sum, sum); sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); - return pfirst(sum); + return _mm_cvtss_f32(sum); #endif } template <> EIGEN_STRONG_INLINE double predux(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d sum = padd(lane0, lane1); - Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); - return pfirst(_mm256_hadd_pd(tmp0, tmp0)); + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d sum = _mm256_add_pd(lane0, lane1); + __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); + return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0))); } template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - return padd(lane0, lane1); + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + return _mm256_add_ps(lane0, lane1); #else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum0 = padd(lane0, lane2); - Packet4f sum1 = padd(lane1, lane3); + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 sum0 = _mm_add_ps(lane0, lane2); + __m128 sum1 = _mm_add_ps(lane1, lane3); return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); #endif } template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const 
Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = padd(lane0, lane1); - return res; +EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + return _mm256_add_pd(lane0, lane1); } template <> @@ -939,108 +1035,70 @@ EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); #else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); #endif } template <> EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = pmul(lane0, lane1); + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = pmul(lane0, lane1); res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); } template <> EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = 
_mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); } template <> EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = _mm256_min_pd(lane0, lane1); + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_min_pd(lane0, lane1); res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); } template <> EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); } + template <> EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = _mm256_max_pd(lane0, lane1); + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + 
__m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_max_pd(lane0, lane1); res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); } -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet16f& first, - const Packet16f& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, - Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, - Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); - - __m512i second_idx = - _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, - Offset - 5, Offset - 6, Offset - 7, Offset - 8, - Offset - 9, Offset - 10, Offset - 11, Offset - 12, - Offset - 13, Offset - 14, Offset - 15, Offset - 16); - - unsigned short mask = 0xFFFF; - mask <<= (16 - Offset); - - first = _mm512_permutexvar_ps(first_idx, first); - Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); - first = _mm512_mask_blend_ps(mask, first, tmp); - } - } -}; -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, - Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - - __m512i second_idx = _mm512_set_epi32( - 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, - Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); - - unsigned char mask = 0xFF; - mask <<= (8 - Offset); - - first = _mm512_permutexvar_pd(first_idx, first); - Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); - first = _mm512_mask_blend_pd(mask, first, tmp); - } - } -}; +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) +{ + Packet16i xi = _mm512_castps_si512(x); + __mmask16 tmp = _mm512_test_epi32_mask(xi,xi); + return !_mm512_kortestz(tmp,tmp); +} + #define 
PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ @@ -1302,11 +1360,940 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, return Packet16f(); } template <> -EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, - const Packet8d& /*thenPacket*/, - const Packet8d& /*elsePacket*/) { - assert(false && "To be implemented"); - return Packet8d(); +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, + const Packet8d& thenPacket, + const Packet8d& elsePacket) { + __mmask8 m = (ifPacket.select[0] ) + | (ifPacket.select[1]<<1) + | (ifPacket.select[2]<<2) + | (ifPacket.select[3]<<3) + | (ifPacket.select[4]<<4) + | (ifPacket.select[5]<<5) + | (ifPacket.select[6]<<6) + | (ifPacket.select[7]<<7); + return _mm512_mask_blend_pd(m, elsePacket, thenPacket); +} + +// Packet math for Eigen::half +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + return _mm256_set1_epi16(from.x); +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + return _mm256_load_si256(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + return _mm256_loadu_si256(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_store_si256((__m256i*)(void*)to, from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_storeu_si256((__m256i*)(void*)to, from); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { 
+ unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + return _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { + return ptrue(Packet8i(a)); 
+} + +template <> +EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) { + const __m256i sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_andnot_si256(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmin(const Packet16h& a, + const Packet16h& b) { + return float2half(pmin(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h pmax(const Packet16h& a, + const Packet16h& b) { + return float2half(pmax(half2float(a), half2float(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16h plset(const half& a) { + return float2half(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { + // in some cases Packet8i is a wrapper around __m256i, so we need to + // cast to Packet8i to call the correct overload. + return por(Packet8i(a),Packet8i(b)); +} +template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { + return pxor(Packet8i(a),Packet8i(b)); +} +template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { + return pand(Packet8i(a),Packet8i(b)); +} +template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { + return pandnot(Packet8i(a),Packet8i(b)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { + return _mm256_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet16h pround(const Packet16h& a) { + return float2half(pround(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h print(const Packet16h& a) { + return float2half(print(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pceil(const Packet16h& a) { + return float2half(pceil(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pfloor(const Packet16h& a) { + return float2half(pfloor(half2float(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { 
+ Packet16f af = half2float(a); + Packet16f bf = half2float(b); + return Pack32To16(pcmp_eq(af, bf)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) { + return Pack32To16(pcmp_le(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) { + return Pack32To16(pcmp_lt(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) { + return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + Packet16h sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_xor_si256(a, sign_mask); +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pdiv(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template <> +EIGEN_STRONG_INLINE Packet8h predux_half_dowto4(const Packet16h& a) { + Packet8h lane0 = _mm256_extractf128_si256(a, 0); + 
Packet8h lane1 = _mm256_extractf128_si256(a, 1); + return padd(lane0, lane1); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet16h& a) { + Packet16f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet16h& a) { + Packet16f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + return _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + return _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; + to[stride*8] = aux[8]; + to[stride*9] = aux[9]; + to[stride*10] = aux[10]; + to[stride*11] = aux[11]; + to[stride*12] = aux[12]; + to[stride*13] = aux[13]; + to[stride*14] = aux[14]; + to[stride*15] = aux[15]; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0]; 
+ __m256i b = kernel.packet[1]; + __m256i c = kernel.packet[2]; + __m256i d = kernel.packet[3]; + __m256i e = kernel.packet[4]; + __m256i f = kernel.packet[5]; + __m256i g = kernel.packet[6]; + __m256i h = kernel.packet[7]; + __m256i i = kernel.packet[8]; + __m256i j = kernel.packet[9]; + __m256i k = kernel.packet[10]; + __m256i l = kernel.packet[11]; + __m256i m = kernel.packet[12]; + __m256i n = kernel.packet[13]; + __m256i o = kernel.packet[14]; + __m256i p = kernel.packet[15]; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i 
ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. 
+ __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0] = a_p_0; + kernel.packet[1] = a_p_1; + kernel.packet[2] = a_p_2; + kernel.packet[3] = a_p_3; + kernel.packet[4] = a_p_4; + kernel.packet[5] = a_p_5; + kernel.packet[6] = a_p_6; + kernel.packet[7] = a_p_7; + kernel.packet[8] = a_p_8; + kernel.packet[9] = a_p_9; + kernel.packet[10] = a_p_a; + kernel.packet[11] = a_p_b; + kernel.packet[12] = a_p_c; + kernel.packet[13] = a_p_d; + kernel.packet[14] = a_p_e; + kernel.packet[15] = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + 
pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + +template <> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16bf type; + typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + HasBlend = 0, + HasInsert = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#ifdef EIGEN_VECTORIZE_AVX512DQ + HasLog = 1, // Currently fails test with bad accuracy. 
+ HasLog1p = 1, + HasExpm1 = 1, + HasNdtri = 1, + HasBessel = 1, +#endif + HasExp = 1, + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, +#endif + HasCmp = 1, + HasDiv = 1 + }; +}; + +template <> +struct unpacket_traits +{ + typedef bfloat16 type; + enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet8bf half; +}; + +template <> +EIGEN_STRONG_INLINE Packet16bf pset1(const bfloat16& from) { + return _mm256_set1_epi16(from.value); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet16bf& from) { + bfloat16 t; + t.value = static_cast(_mm256_extract_epi16(from, 0)); + return t; +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pload(const bfloat16* from) { + return _mm256_load_si256(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf ploadu(const bfloat16* from) { + return _mm256_loadu_si256(reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE void pstore(bfloat16* to, + const Packet16bf& from) { + _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, + const Packet16bf& from) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet16bf +ploaddup(const bfloat16* from) { + Packet16bf r; + unsigned short a = from[0].value; + unsigned short b = from[1].value; + unsigned short c = from[2].value; + unsigned short d = from[3].value; + unsigned short e = from[4].value; + unsigned short f = from[5].value; + unsigned short g = from[6].value; + unsigned short h = from[7].value; + return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); +} + +template<> EIGEN_STRONG_INLINE Packet16bf +ploadquad(const bfloat16* from) { + Packet16bf r; + unsigned short a = from[0].value; + unsigned short b = from[1].value; + unsigned short c = from[2].value; + unsigned short 
d = from[3].value; + return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); +} + +EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) { + return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16)); +} + +// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm. +EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { + Packet16bf r; + +#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) + // Since GCC 10.1 supports avx512bf16 and C style explicit cast + // (C++ static_cast is not supported yet), do converion via intrinsic + // and register path for performance. + r = (__m256i)(_mm512_cvtneps_pbh(a)); + +#else + __m512i t; + __m512i input = _mm512_castps_si512(a); + __m512i nan = _mm512_set1_epi32(0x7fc0); + + // uint32_t lsb = (input >> 16) & 1; + t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1)); + // uint32_t rounding_bias = 0x7fff + lsb; + t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff)); + // input += rounding_bias; + t = _mm512_add_epi32(t, input); + // input = input >> 16; + t = _mm512_srli_epi32(t, 16); + + // Check NaN before converting back to bf16 + __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + + t = _mm512_mask_blend_epi32(mask, nan, t); + // output.value = static_cast(input); + r = _mm512_cvtepi32_epi16(t); +#endif // EIGEN_VECTORIZE_AVX512BF16 + + return r; +} + +template <> +EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) { + return ptrue(a); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) { + return por(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) { + return pxor(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) { + return pand(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, + const Packet16bf& b) { + return 
pandnot(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, + const Packet16bf& a, + const Packet16bf& b) { + // Input mask is expected to be all 0/1, handle it with 8-bit + // intrinsic for performance. + return _mm256_blendv_epi8(b, a, mask); +} + +template<> EIGEN_STRONG_INLINE Packet16bf pround(const Packet16bf& a) +{ + return F32ToBf16(pround(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16bf print(const Packet16bf& a) { + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16bf pceil(const Packet16bf& a) { + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet16bf pfloor(const Packet16bf& a) { + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, + const Packet16bf& b) { + return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) { + Packet16bf sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_xor_si256(a, sign_mask); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) { + const __m256i sign_mask = _mm256_set1_epi16(static_cast(0x8000)); + return _mm256_andnot_si256(sign_mask, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf padd(const Packet16bf& a, + const Packet16bf& b) { + return 
F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf psub(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmul(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pdiv(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmin(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pmax(const Packet16bf& a, + const Packet16bf& b) { + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf plset(const bfloat16& a) { + return F32ToBf16(plset(static_cast(a))); +} + +template <> +EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4(const Packet16bf& a) { + Packet8bf lane0 = _mm256_extractf128_si256(a, 0); + Packet8bf lane1 = _mm256_extractf128_si256(a, 1); + return padd(lane0, lane1); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& p) { + return static_cast(predux(Bf16ToF32(p))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) { + return static_cast(predux_mul(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) { + return static_cast(predux_min(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) { + return static_cast(predux_max(Bf16ToF32(from))); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) { + __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1, + 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + + Packet16bf res; + // Swap hi and lo first because shuffle is in 128-bit lanes. 
+ res = _mm256_permute2x128_si256(a, a, 1); + // Shuffle 8-bit values in src within 2*128-bit lanes. + return _mm256_shuffle_epi8(res, m); +} + +template <> +EIGEN_STRONG_INLINE Packet16bf pgather(const bfloat16* from, + Index stride) { + return _mm256_set_epi16( + from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value, + from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value, + from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value, + from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value); +} + +template <> +EIGEN_STRONG_INLINE void pscatter(bfloat16* to, + const Packet16bf& from, + Index stride) { + EIGEN_ALIGN64 bfloat16 aux[16]; + pstore(aux, from); + to[stride*0] = aux[0]; + to[stride*1] = aux[1]; + to[stride*2] = aux[2]; + to[stride*3] = aux[3]; + to[stride*4] = aux[4]; + to[stride*5] = aux[5]; + to[stride*6] = aux[6]; + to[stride*7] = aux[7]; + to[stride*8] = aux[8]; + to[stride*9] = aux[9]; + to[stride*10] = aux[10]; + to[stride*11] = aux[11]; + to[stride*12] = aux[12]; + to[stride*13] = aux[13]; + to[stride*14] = aux[14]; + to[stride*15] = aux[15]; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0]; + __m256i b = kernel.packet[1]; + __m256i c = kernel.packet[2]; + __m256i d = kernel.packet[3]; + __m256i e = kernel.packet[4]; + __m256i f = kernel.packet[5]; + __m256i g = kernel.packet[6]; + __m256i h = kernel.packet[7]; + __m256i i = kernel.packet[8]; + __m256i j = kernel.packet[9]; + __m256i k = kernel.packet[10]; + __m256i l = kernel.packet[11]; + __m256i m = kernel.packet[12]; + __m256i n = kernel.packet[13]; + __m256i o = kernel.packet[14]; + __m256i p = kernel.packet[15]; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = 
_mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i 
abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0]; + __m256i b = kernel.packet[1]; + 
__m256i c = kernel.packet[2]; + __m256i d = kernel.packet[3]; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20); + kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20); + kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31); + kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31); } } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h new file mode 100644 index 00000000..33041272 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_AVX512_H +#define EIGEN_TYPE_CASTING_AVX512_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { + return _mm512_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { + return _mm512_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { + return _mm512_castps_si512(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16i& a) { + return _mm512_castsi512_ps(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16bf& a) { + return Bf16ToF32(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16bf pcast(const Packet16f& a) { + return F32ToBf16(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_AVX512_H diff --git a/externals/eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/externals/eigen/Eigen/src/Core/arch/AltiVec/Complex.h index 67db2f8e..f424f11c 100644 --- a/externals/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/externals/eigen/Eigen/src/Core/arch/AltiVec/Complex.h @@ -29,8 +29,54 @@ static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P //---------- float ---------- struct Packet2cf { - 
EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {} + EIGEN_STRONG_INLINE explicit Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) + { + Packet4f v1, v2; + + // Permute and multiply the real parts of a and b + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + // Get the imaginary parts of a + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + // multiply a_re * b + v1 = vec_madd(v1, b.v, p4f_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(v2, b.v, p4f_ZERO); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + // permute back to a proper order + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); + + return Packet2cf(padd(v1, v2)); + } + + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + v = pmul(Packet2cf(*this), b).v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(-v); + } + Packet4f v; }; @@ -38,6 +84,7 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; typedef Packet2cf half; + typedef Packet4f as_real; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -60,7 +107,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { 
typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -80,16 +127,35 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +{ + Packet4f res0, res1; +#ifdef __VSX__ + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); +#ifdef _BIG_ENDIAN + __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); +#else + __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); +#endif +#else + *reinterpret_cast *>(&res0) = *from0; + *reinterpret_cast *>(&res1) = *from1; + res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); +#endif + return Packet2cf(res0); +} + template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -100,25 +166,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf 
pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f v1, v2; - - // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); - // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - // multiply a_re * b - v1 = vec_madd(v1, b.v, p4f_ZERO); - // multiply a_im * b and get the conjugate result - v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); - // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - - return Packet2cf(padd(v1, v2)); -} - template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v, b.v)); } @@ -128,7 +175,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore((float *)&res, a.v); return res[0]; @@ -149,22 +196,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(b)); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - Packet4f b1, b2; -#ifdef _BIG_ENDIAN - b1 = vec_sld(vecs[0].v, vecs[1].v, 8); - b2 = vec_sld(vecs[1].v, vecs[0].v, 8); -#else - b1 = vec_sld(vecs[1].v, vecs[0].v, 8); - b2 = vec_sld(vecs[0].v, vecs[1].v, 8); -#endif - b2 = vec_sld(b2, b2, 8); - b2 = padd(b1, b2); - - return Packet2cf(b2); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { Packet4f b; @@ -175,77 +206,12 @@ template<> EIGEN_STRONG_INLINE 
std::complex predux_mul(const P return pfirst(prod); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { -#ifdef _BIG_ENDIAN - first.v = vec_sld(first.v, second.v, 8); -#else - first.v = vec_sld(second.v, first.v, 8); -#endif - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const 
Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } @@ -262,6 +228,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[0].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = reinterpret_cast(vec_cmpeq(a.v,b.v)); + return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV))); +} + #ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { Packet2cf result; @@ -270,12 +241,62 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con } #endif +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) +{ + return psqrt_complex(a); +} + //---------- double ---------- #ifdef __VSX__ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) + { + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); + v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); + + return Packet1cd(padd(v1, v2)); + } + + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + v = pmul(Packet1cd(*this), b).v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + + EIGEN_STRONG_INLINE Packet1cd& 
operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(-v); + } + Packet2d v; }; @@ -283,6 +304,7 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; + typedef Packet2d as_real; enum { Vectorizable = 1, AlignedOnScalar = 0, @@ -302,7 +324,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; }; template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } @@ -312,19 +334,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex< template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index) { - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); + return pload(from); } -template<> EIGEN_DEVICE_FUNC inline 
void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index) { - std::complex EIGEN_ALIGN16 af[2]; - pstore >(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + pstore >(to, from); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } @@ -332,24 +348,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, con template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast(p2ul_CONJ_XOR2))); } -template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) -{ - Packet2d a_re, a_im, v1, v2; - - // Permute and multiply the real parts of a and b - a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); - // Get the imaginary parts of a - a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); - // multiply a_re * b - v1 = vec_madd(a_re, b.v, p2d_ZERO); - // multiply a_im * b and get the conjugate result - v2 = vec_madd(a_im, b.v, p2d_ZERO); - v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); - v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); - - return Packet1cd(padd(v1, v2)); -} - template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); } @@ -361,7 +359,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex 
res[2]; pstore >(res, a); return res[0]; @@ -370,74 +368,15 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(internal::pmul(x, 
y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); } @@ -453,6 +392,23 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); kernel.packet[0].v = tmp; } + +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = reinterpret_cast(vec_cmpeq(a.v,b.v)); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = reinterpret_cast(vec_sld(reinterpret_cast(eq), reinterpret_cast(eq), 8)); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(vec_and(eq, eq_swapped)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) +{ + return psqrt_complex(a); +} + #endif // __VSX__ } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h index c5e4bede..3a7a3293 100644 --- a/externals/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -9,10 +9,6 @@ // Public License v. 2.0. 
If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H @@ -20,180 +16,28 @@ namespace Eigen { namespace internal { -static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); -static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); -static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); -static _EIGEN_DECLARE_CONST_Packet4i(23, 23); - -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - -/* the smallest non denormalized float number */ -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); - -/* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 -*/ -static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - -static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); -static 
_EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - -#ifdef __VSX__ -static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); -static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); -static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - -static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); -static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); -static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - -#ifdef __POWER8_VECTOR__ -static Packet2l p2l_1023 = { 1023, 1023 }; -static Packet2ul p2ul_52 = { 52, 52 }; -#endif - -#endif - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS 
EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - Packet4f x = _x; - - Packet4i emm0; - - /* isvalid_mask is 0 if x < 0 or x is NaN. */ - Packet4ui isvalid_mask = reinterpret_cast(vec_cmpge(x, p4f_ZERO)); - Packet4ui iszero_mask = reinterpret_cast(vec_cmpeq(x, p4f_ZERO)); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = vec_sr(reinterpret_cast(x), - reinterpret_cast(p4i_23)); - - /* keep only the fractional part */ - x = pand(x, p4f_inv_mant_mask); - x = por(x, p4f_half); - - emm0 = psub(emm0, p4i_0x7f); - Packet4f e = padd(vec_ctf(emm0, 0), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = reinterpret_cast(vec_cmplt(x, p4f_cephes_SQRTHF)); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - x = vec_sel(x, p4f_minus_inf, iszero_mask); - x = vec_sel(p4f_minus_nan, x, isvalid_mask); - return x; + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - Packet4f x = _x; - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - // express exp(x) as exp(g + n*log(2)) - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - - fx = pfloor(fx); - - tmp = 
pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); + return pexp_float(_x); +} - // build 2^n - emm0 = vec_cts(fx, 0); - emm0 = vec_add(emm0, p4i_0x7f); - emm0 = vec_sl(emm0, reinterpret_cast(p4i_23)); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psin(const Packet4f& _x) +{ + return psin_float(_x); +} - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. - Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), - isnumber_mask); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pcos(const Packet4f& _x) +{ + return pcos_float(_x); } #ifndef EIGEN_COMP_CLANG @@ -225,95 +69,19 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -// VSX support varies between different compilers and even different -// versions of the same compiler. For gcc version >= 4.9.3, we can use -// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use -// a slow version that works with older compilers. -// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles -// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 -static inline Packet2l ConvertToPacket2l(const Packet2d& x) { -#if EIGEN_GNUC_AT_LEAST(5, 4) || \ - (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) - return vec_cts(x, 0); // TODO: check clang version. 
-#else - double tmp[2]; - memcpy(tmp, &x, sizeof(tmp)); - Packet2l l = { static_cast(tmp[0]), - static_cast(tmp[1]) }; - return l; -#endif -} - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) { - Packet2d x = _x; - - Packet2d tmp, fx; - Packet2l emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half); - - fx = pfloor(fx); - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = ConvertToPacket2l(fx); - -#ifdef __POWER8_VECTOR__ - emm0 = vec_add(emm0, p2l_1023); - emm0 = vec_sl(emm0, p2ul_52); -#else - // Code is a bit complex for POWER7. There is actually a - // vec_xxsldi intrinsic but it is not supported by some gcc versions. - // So we shift (52-32) bits and do a word swap with zeros. - _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); - _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32 - - Packet4i emm04i = reinterpret_cast(emm0); - emm04i = vec_add(emm04i, p4i_1023); - emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); - static const Packet16uc perm = { - 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, - 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; -#ifdef _BIG_ENDIAN - emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); -#else - emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); -#endif - + return pexp_double(_x); +} #endif - // Altivec's max & min operators just drop silent NaNs. 
Check NaNs in - // inputs and return them unmodified. - Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), - isnumber_mask); +// Hyperbolic Tangent function. +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& x) { + return internal::generic_fast_tanh_float(x); } -#endif } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h new file mode 100644 index 00000000..3f79b97d --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -0,0 +1,2937 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com) +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H + +#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK +#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1 +#endif + +#include "MatrixProductCommon.h" + +// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX +#if EIGEN_COMP_LLVM +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) +#ifdef __MMA__ +#define EIGEN_ALTIVEC_MMA_ONLY +#else +#define EIGEN_ALTIVEC_DISABLE_MMA +#endif +#endif +#endif + +#ifdef __has_builtin +#if __has_builtin(__builtin_mma_assemble_acc) + #define ALTIVEC_MMA_SUPPORT +#endif +#endif + +#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #include "MatrixProductMMA.h" +#endif + +/************************************************************************************************** + * TODO * + * - Check StorageOrder on dhs_pack (the innermost second loop seems unvectorized when it could). * + * - Check the possibility of transposing as GETREAL and GETIMAG when needed. * + **************************************************************************************************/ +namespace Eigen { + +namespace internal { + +/************************** + * Constants and typedefs * + **************************/ +template +struct quad_traits +{ + typedef typename packet_traits::type vectortype; + typedef PacketBlock type; + typedef vectortype rhstype; + enum + { + vectorsize = packet_traits::size, + size = 4, + rows = 4 + }; +}; + +template<> +struct quad_traits +{ + typedef Packet2d vectortype; + typedef PacketBlock type; + typedef PacketBlock rhstype; + enum + { + vectorsize = packet_traits::size, + size = 2, + rows = 4 + }; +}; + +// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector, this turned out +// to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. 
This constants then +// are responsible to extract from convert between Eigen's and MatrixProduct approach. + +const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3, + 8, 9, 10, 11, + 16, 17, 18, 19, + 24, 25, 26, 27}; + +const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7, + 12, 13, 14, 15, + 20, 21, 22, 23, + 28, 29, 30, 31}; +const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23}; + +//[a,ai],[b,bi] = [ai,bi] +const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31}; + +/********************************************* + * Single precision real and complex packing * + * *******************************************/ + +/** + * Symm packing is related to packing of symmetric adjoint blocks, as expected the packing leaves + * the diagonal real, whatever is below it is copied from the respective upper diagonal element and + * conjugated. There's no PanelMode available for symm packing. + * + * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using + * its respective rank-update instructions. The float32/64 versions are different because at this moment + * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements. + * + * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has + * to take that into account, at the moment, we run pack the real part and then the imaginary part, this is the main + * reason why packing for complex is broken down into several different parts, also the reason why we endup having a + * float32/64 and complex float32/64 version. 
+ **/ +template +EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) +{ + std::complex v; + if(i < j) + { + v.real( dt(j,i).real()); + v.imag(-dt(j,i).imag()); + } else if(i > j) + { + v.real( dt(i,j).real()); + v.imag( dt(i,j).imag()); + } else { + v.real( dt(i,j).real()); + v.imag((Scalar)0.0); + } + return v; +} + +template +EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) +{ + const Index depth = k2 + rows; + const_blas_data_mapper, Index, StorageOrder> rhs(_rhs, rhsStride); + const Index vectorSize = N*quad_traits::vectorsize; + const Index vectorDelta = vectorSize * rows; + Scalar* blockBf = reinterpret_cast(blockB); + + Index rir = 0, rii, j = 0; + for(; j + vectorSize <= cols; j+=vectorSize) + { + rii = rir + vectorDelta; + + for(Index i = k2; i < depth; i++) + { + for(Index k = 0; k < vectorSize; k++) + { + std::complex v = getAdjointVal(i, j + k, rhs); + + blockBf[rir + k] = v.real(); + blockBf[rii + k] = v.imag(); + } + rir += vectorSize; + rii += vectorSize; + } + + rir += vectorDelta; + } + if (j < cols) + { + rii = rir + ((cols - j) * rows); + + for(Index i = k2; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + std::complex v = getAdjointVal(i, k, rhs); + + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); + + rir += 1; + rii += 1; + } + } + } +} + +template +EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) +{ + const Index depth = cols; + const_blas_data_mapper, Index, StorageOrder> lhs(_lhs, lhsStride); + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = vectorSize * depth; + Scalar* blockAf = (Scalar *)(blockA); + + Index rir = 0, rii, j = 0; + for(; j + vectorSize <= rows; j+=vectorSize) + { + rii = rir + vectorDelta; + + for(Index i = 0; i < 
depth; i++) + { + for(Index k = 0; k < vectorSize; k++) + { + std::complex v = getAdjointVal(j+k, i, lhs); + + blockAf[rir + k] = v.real(); + blockAf[rii + k] = v.imag(); + } + rir += vectorSize; + rii += vectorSize; + } + + rir += vectorDelta; + } + + if (j < rows) + { + rii = rir + ((rows - j) * depth); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + std::complex v = getAdjointVal(k, i, lhs); + + blockAf[rir] = v.real(); + blockAf[rii] = v.imag(); + + rir += 1; + rii += 1; + } + } + } +} + +template +EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) +{ + const Index depth = k2 + rows; + const_blas_data_mapper rhs(_rhs, rhsStride); + const Index vectorSize = quad_traits::vectorsize; + + Index ri = 0, j = 0; + for(; j + N*vectorSize <= cols; j+=N*vectorSize) + { + Index i = k2; + for(; i < depth; i++) + { + for(Index k = 0; k < N*vectorSize; k++) + { + if(i <= j+k) + blockB[ri + k] = rhs(j+k, i); + else + blockB[ri + k] = rhs(i, j+k); + } + ri += N*vectorSize; + } + } + + if (j < cols) + { + for(Index i = k2; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + if(k <= i) + blockB[ri] = rhs(i, k); + else + blockB[ri] = rhs(k, i); + ri += 1; + } + } + } +} + +template +EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) +{ + const Index depth = cols; + const_blas_data_mapper lhs(_lhs, lhsStride); + const Index vectorSize = quad_traits::vectorsize; + + Index ri = 0, j = 0; + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + for(; i < depth; i++) + { + for(Index k = 0; k < vectorSize; k++) + { + if(i <= j+k) + blockA[ri + k] = lhs(j+k, i); + else + blockA[ri + k] = lhs(i, j+k); + } + ri += vectorSize; + } + } + + if (j < rows) + { + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + if(i <= k) + blockA[ri] = lhs(k, i); + 
else + blockA[ri] = lhs(i, k); + ri += 1; + } + } + } +} + +template +struct symm_pack_rhs, Index, nr, StorageOrder> +{ + void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrder> +{ + void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +// *********** symm_pack std::complex *********** + +template +struct symm_pack_rhs, Index, nr, StorageOrder> +{ + void operator()(std::complex* blockB, const std::complex* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_complex_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs, Index, Pack1, Pack2_dummy, StorageOrder> +{ + void operator()(std::complex* blockA, const std::complex* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_complex_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +// *********** symm_pack float32 *********** +template +struct symm_pack_rhs +{ + void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs +{ + void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) + { + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +// *********** symm_pack float64 *********** +template +struct symm_pack_rhs +{ + void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + symm_pack_rhs_helper(blockB, _rhs, rhsStride, rows, cols, k2); + } +}; + +template +struct symm_pack_lhs +{ + void operator()(double* blockA, const double* _lhs, 
Index lhsStride, Index cols, Index rows) + { + symm_pack_lhs_helper(blockA, _lhs, lhsStride, cols, rows); + } +}; + +/** + * PanelMode + * Packing might be called several times before being multiplied by gebp_kernel, this happens because + * on special occasions it fills part of block with other parts of the matrix. Two variables control + * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever + * is going to be the real offset and stride in the future and this is what you should obey. The process + * is to behave as you would with normal packing but leave the start of each part with the correct offset + * and the end as well respecting the real stride the block will have. Gebp is aware of both blocks stride + * and offset and behaves accordingly. + **/ + +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +{ + const Index size = 16 / sizeof(Scalar); + pstore(to + (0 * size), block.packet[0]); + pstore(to + (1 * size), block.packet[1]); + pstore(to + (2 * size), block.packet[2]); + pstore(to + (3 * size), block.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +{ + const Index size = 16 / sizeof(Scalar); + pstore(to + (0 * size), block.packet[0]); + pstore(to + (1 * size), block.packet[1]); +} + +// General template for lhs & rhs complex packing. +template +struct dhs_cpack { + EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth); + Index rir = ((PanelMode) ? 
(vectorSize*offset) : 0), rii; + Scalar* blockAt = reinterpret_cast(blockA); + Index j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + rii = rir + vectorDelta; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + if (UseLhs) { + bload(cblock, lhs, j, i); + } else { + bload(cblock, lhs, i, j); + } + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); + blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32); + blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32); + blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32); + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32); + blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32); + blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32); + blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + blocki.packet[1] = -blocki.packet[1]; + blocki.packet[2] = -blocki.packet[2]; + blocki.packet[3] = -blocki.packet[3]; + } + + if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs))) + { + ptranspose(blockr); + ptranspose(blocki); + } + + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); + + rir += 4*vectorSize; + rii += 4*vectorSize; + } + for(; i < depth; i++) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs))) + { + if (UseLhs) { + cblock.packet[0] = lhs.template loadPacket(j + 0, i); + cblock.packet[1] = lhs.template loadPacket(j + 2, i); + } else { + cblock.packet[0] = lhs.template loadPacket(i, j + 0); + cblock.packet[1] = lhs.template loadPacket(i, j + 2); + } + } else { + 
std::complex lhs0, lhs1; + if (UseLhs) { + lhs0 = lhs(j + 0, i); + lhs1 = lhs(j + 1, i); + cblock.packet[0] = pload2(&lhs0, &lhs1); + lhs0 = lhs(j + 2, i); + lhs1 = lhs(j + 3, i); + cblock.packet[1] = pload2(&lhs0, &lhs1); + } else { + lhs0 = lhs(i, j + 0); + lhs1 = lhs(i, j + 1); + cblock.packet[0] = pload2(&lhs0, &lhs1); + lhs0 = lhs(i, j + 2); + lhs1 = lhs(i, j + 3); + cblock.packet[1] = pload2(&lhs0, &lhs1); + } + } + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32); + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + } + + pstore(blockAt + rir, blockr.packet[0]); + pstore(blockAt + rii, blocki.packet[0]); + + rir += vectorSize; + rii += vectorSize; + } + + rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); + } + + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + if (UseLhs) { + blockAt[rir] = lhs(k, i).real(); + + if(Conjugate) + blockAt[rii] = -lhs(k, i).imag(); + else + blockAt[rii] = lhs(k, i).imag(); + } else { + blockAt[rir] = lhs(i, k).real(); + + if(Conjugate) + blockAt[rii] = -lhs(i, k).imag(); + else + blockAt[rii] = lhs(i, k).imag(); + } + + rir += 1; + rii += 1; + } + } + } + } +}; + +// General template for lhs & rhs packing. 
+template +struct dhs_pack{ + EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + Index ri = 0, j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + if(PanelMode) ri += vectorSize*offset; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock block; + + if (UseLhs) { + bload(block, lhs, j, i); + } else { + bload(block, lhs, i, j); + } + if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) + { + ptranspose(block); + } + + storeBlock(blockA + ri, block); + + ri += 4*vectorSize; + } + for(; i < depth; i++) + { + if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) + { + if (UseLhs) { + blockA[ri+0] = lhs(j+0, i); + blockA[ri+1] = lhs(j+1, i); + blockA[ri+2] = lhs(j+2, i); + blockA[ri+3] = lhs(j+3, i); + } else { + blockA[ri+0] = lhs(i, j+0); + blockA[ri+1] = lhs(i, j+1); + blockA[ri+2] = lhs(i, j+2); + blockA[ri+3] = lhs(i, j+3); + } + } else { + Packet lhsV; + if (UseLhs) { + lhsV = lhs.template loadPacket(j, i); + } else { + lhsV = lhs.template loadPacket(i, j); + } + pstore(blockA + ri, lhsV); + } + + ri += vectorSize; + } + + if(PanelMode) ri += vectorSize*(stride - offset - depth); + } + + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + if (UseLhs) { + blockA[ri] = lhs(k, i); + } else { + blockA[ri] = lhs(i, k); + } + ri += 1; + } + } + } + } +}; + +// General template for lhs packing, float64 specialization. 
+template +struct dhs_pack +{ + EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + Index ri = 0, j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + if(PanelMode) ri += vectorSize*offset; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock block; + if(StorageOrder == RowMajor) + { + block.packet[0] = lhs.template loadPacket(j + 0, i); + block.packet[1] = lhs.template loadPacket(j + 1, i); + + ptranspose(block); + } else { + block.packet[0] = lhs.template loadPacket(j, i + 0); + block.packet[1] = lhs.template loadPacket(j, i + 1); + } + + storeBlock(blockA + ri, block); + + ri += 2*vectorSize; + } + for(; i < depth; i++) + { + if(StorageOrder == RowMajor) + { + blockA[ri+0] = lhs(j+0, i); + blockA[ri+1] = lhs(j+1, i); + } else { + Packet2d lhsV = lhs.template loadPacket(j, i); + pstore(blockA + ri, lhsV); + } + + ri += vectorSize; + } + + if(PanelMode) ri += vectorSize*(stride - offset - depth); + } + + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + blockA[ri] = lhs(k, i); + ri += 1; + } + } + } + } +}; + +// General template for rhs packing, float64 specialization. 
+template +struct dhs_pack +{ + EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + Index ri = 0, j = 0; + + for(; j + 2*vectorSize <= cols; j+=2*vectorSize) + { + Index i = 0; + + if(PanelMode) ri += offset*(2*vectorSize); + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock block; + if(StorageOrder == ColMajor) + { + PacketBlock block1, block2; + block1.packet[0] = rhs.template loadPacket(i, j + 0); + block1.packet[1] = rhs.template loadPacket(i, j + 1); + block2.packet[0] = rhs.template loadPacket(i, j + 2); + block2.packet[1] = rhs.template loadPacket(i, j + 3); + + ptranspose(block1); + ptranspose(block2); + + pstore(blockB + ri , block1.packet[0]); + pstore(blockB + ri + 2, block2.packet[0]); + pstore(blockB + ri + 4, block1.packet[1]); + pstore(blockB + ri + 6, block2.packet[1]); + } else { + block.packet[0] = rhs.template loadPacket(i + 0, j + 0); //[a1 a2] + block.packet[1] = rhs.template loadPacket(i + 0, j + 2); //[a3 a4] + block.packet[2] = rhs.template loadPacket(i + 1, j + 0); //[b1 b2] + block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] + + storeBlock(blockB + ri, block); + } + + ri += 4*vectorSize; + } + for(; i < depth; i++) + { + if(StorageOrder == ColMajor) + { + blockB[ri+0] = rhs(i, j+0); + blockB[ri+1] = rhs(i, j+1); + + ri += vectorSize; + + blockB[ri+0] = rhs(i, j+2); + blockB[ri+1] = rhs(i, j+3); + } else { + Packet2d rhsV = rhs.template loadPacket(i, j); + pstore(blockB + ri, rhsV); + + ri += vectorSize; + + rhsV = rhs.template loadPacket(i, j + 2); + pstore(blockB + ri, rhsV); + } + ri += vectorSize; + } + + if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth); + } + + if (j < cols) + { + if(PanelMode) ri += offset*(cols - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + blockB[ri] = rhs(i, k); + ri += 1; + } + } + } + } +}; + 
+// General template for lhs complex packing, float64 specialization. +template +struct dhs_cpack +{ + EIGEN_STRONG_INLINE void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth); + Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii; + double* blockAt = reinterpret_cast(blockA); + Index j = 0; + + for(; j + vectorSize <= rows; j+=vectorSize) + { + Index i = 0; + + rii = rir + vectorDelta; + + for(; i + vectorSize <= depth; i+=vectorSize) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + if(StorageOrder == ColMajor) + { + cblock.packet[0] = lhs.template loadPacket(j, i + 0); //[a1 a1i] + cblock.packet[1] = lhs.template loadPacket(j, i + 1); //[b1 b1i] + + cblock.packet[2] = lhs.template loadPacket(j + 1, i + 0); //[a2 a2i] + cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i] + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64); //[a1 a2] + blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64); + blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64); + } else { + cblock.packet[0] = lhs.template loadPacket(j + 0, i); //[a1 a1i] + cblock.packet[1] = lhs.template loadPacket(j + 1, i); //[a2 a2i] + + cblock.packet[2] = lhs.template loadPacket(j + 0, i + 1); //[b1 b1i] + cblock.packet[3] = lhs.template loadPacket(j + 1, i + 1); //[b2 b2i + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); //[a1 a2] + blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); //[b1 b2] + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + blocki.packet[1] = 
vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); + } + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + blocki.packet[1] = -blocki.packet[1]; + } + + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); + + rir += 2*vectorSize; + rii += 2*vectorSize; + } + for(; i < depth; i++) + { + PacketBlock blockr, blocki; + PacketBlock cblock; + + cblock.packet[0] = lhs.template loadPacket(j + 0, i); + cblock.packet[1] = lhs.template loadPacket(j + 1, i); + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + } + + pstore(blockAt + rir, blockr.packet[0]); + pstore(blockAt + rii, blocki.packet[0]); + + rir += vectorSize; + rii += vectorSize; + } + + rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); + } + + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + blockAt[rir] = lhs(k, i).real(); + + if(Conjugate) + blockAt[rii] = -lhs(k, i).imag(); + else + blockAt[rii] = lhs(k, i).imag(); + + rir += 1; + rii += 1; + } + } + } + } +}; + +// General template for rhs complex packing, float64 specialization. +template +struct dhs_cpack +{ + EIGEN_STRONG_INLINE void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) + { + const Index vectorSize = quad_traits::vectorsize; + const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth); + Index rir = ((PanelMode) ? 
(2*vectorSize*offset) : 0), rii; + double* blockBt = reinterpret_cast(blockB); + Index j = 0; + + for(; j + 2*vectorSize <= cols; j+=2*vectorSize) + { + Index i = 0; + + rii = rir + vectorDelta; + + for(; i < depth; i++) + { + PacketBlock cblock; + PacketBlock blockr, blocki; + + bload(cblock, rhs, i, j); + + blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); + blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); + + blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64); + blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64); + + if(Conjugate) + { + blocki.packet[0] = -blocki.packet[0]; + blocki.packet[1] = -blocki.packet[1]; + } + + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); + + rir += 2*vectorSize; + rii += 2*vectorSize; + } + + rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta); + } + + if (j < cols) + { + if(PanelMode) rir += (offset*(cols - j - 2*vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (cols - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < cols; k++) + { + blockBt[rir] = rhs(i, k).real(); + + if(Conjugate) + blockBt[rii] = -rhs(i, k).imag(); + else + blockBt[rii] = rhs(i, k).imag(); + + rir += 1; + rii += 1; + } + } + } + } +}; + +/************** + * GEMM utils * + **************/ + +// 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm). 
+template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +{ + if(NegativeAccumulate) + { + acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); + acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); + acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); + acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); + } else { + acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); + acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); + acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); + acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); + } +} + +template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +{ + if(NegativeAccumulate) + { + acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); + } else { + acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); + } +} + +template +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) +{ + Packet lhsV = pload(lhs); + + pger_common(acc, lhsV, rhsV); +} + +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) +{ +#ifdef _ARCH_PWR9 + lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); +#else + Index i = 0; + do { + lhsV[i] = lhs[i]; + } while (++i < remaining_rows); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) +{ + Packet lhsV; + loadPacketRemaining(lhs, lhsV, remaining_rows); + + pger_common(acc, lhsV, rhsV); +} + +// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. 
+template +EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) +{ + pger_common(accReal, lhsV, rhsV); + if(LhsIsReal) + { + pger_common(accImag, lhsV, rhsVi); + EIGEN_UNUSED_VARIABLE(lhsVi); + } else { + if (!RhsIsReal) { + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); + } else { + EIGEN_UNUSED_VARIABLE(rhsVi); + } + pger_common(accImag, lhsVi, rhsV); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) +{ + Packet lhsV = ploadLhs(lhs_ptr); + Packet lhsVi; + if(!LhsIsReal) lhsVi = ploadLhs(lhs_ptr_imag); + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + + pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); +} + +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +{ +#ifdef _ARCH_PWR9 + lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); + if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar)); + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +#else + Index i = 0; + do { + lhsV[i] = lhs_ptr[i]; + if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i]; + } while (++i < remaining_rows); + if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +{ + Packet lhsV, lhsVi; + loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); + + pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); +} + +template +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) +{ + return ploadu(lhs); +} + +// Zero the accumulator on PacketBlock. 
+template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +{ + acc.packet[0] = pset1((Scalar)0); + acc.packet[1] = pset1((Scalar)0); + acc.packet[2] = pset1((Scalar)0); + acc.packet[3] = pset1((Scalar)0); +} + +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +{ + acc.packet[0] = pset1((Scalar)0); +} + +// Scale the PacketBlock vectors by alpha. +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); + acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); + acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); + acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); +} + +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmul(accZ.packet[0], pAlpha); + acc.packet[1] = pmul(accZ.packet[1], pAlpha); + acc.packet[2] = pmul(accZ.packet[2], pAlpha); + acc.packet[3] = pmul(accZ.packet[3], pAlpha); +} + +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +{ + acc.packet[0] = pmul(accZ.packet[0], pAlpha); +} + +// Complex version of PacketBlock scaling. 
+template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) +{ + bscalec_common(cReal, aReal, bReal); + + bscalec_common(cImag, aImag, bReal); + + pger_common(&cReal, bImag, aImag.packet); + + pger_common(&cImag, bImag, aReal.packet); +} + +template +EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) +{ + acc.packet[0] = pand(acc.packet[0], pMask); + acc.packet[1] = pand(acc.packet[1], pMask); + acc.packet[2] = pand(acc.packet[2], pMask); + acc.packet[3] = pand(acc.packet[3], pMask); +} + +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) +{ + band(aReal, pMask); + band(aImag, pMask); + + bscalec(aReal, aImag, bReal, bImag, cReal, cImag); +} + +// Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as needed. +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +{ + if (StorageOrder == RowMajor) { + acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); + acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); + acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); + acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); + } else { + acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); + acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); + acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); + acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); + } +} + +// An overload of bload when you have a PacketBLock with 8 vectors. 
+template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +{ + if (StorageOrder == RowMajor) { + acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); + acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); + acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); + acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); + acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); + acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); + acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); + acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); + } else { + acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); + acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); + acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); + acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); + acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); + acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); + acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); + acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); + } +} + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +{ + acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); + acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); +} + +const static Packet4i mask41 = { -1, 0, 0, 0 }; +const static Packet4i mask42 = { -1, -1, 0, 0 }; +const static Packet4i mask43 = { -1, -1, -1, 0 }; + +const static Packet2l mask21 = { -1, 0 }; + +template +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows) +{ + if (remaining_rows == 0) { + return pset1(float(0.0)); // Not used + } else { + switch (remaining_rows) { + case 1: return Packet(mask41); + case 2: return Packet(mask42); + 
default: return Packet(mask43); + } + } +} + +template<> +EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) +{ + if (remaining_rows == 0) { + return pset1(double(0.0)); // Not used + } else { + return Packet2d(mask21); + } +} + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +{ + band(accZ, pMask); + + bscale(acc, accZ, pAlpha); +} + +template +EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + pbroadcast4(a, a0, a1, a2, a3); +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(a); + a3 = pload(a + 2); + a0 = vec_splat(a1, 0); + a1 = vec_splat(a1, 1); + a2 = vec_splat(a3, 0); + a3 = vec_splat(a3, 1); +} + +// PEEL loop factor. +#define PEEL 7 + +template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( + const Scalar* &lhs_ptr, + const Scalar* &rhs_ptr, + PacketBlock &accZero, + Index remaining_rows, + Index remaining_cols) +{ + Packet rhsV[1]; + rhsV[0] = pset1(rhs_ptr[0]); + pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += remaining_cols; +} + +template +EIGEN_STRONG_INLINE void gemm_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlpha) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; + PacketBlock accZero; + + bsetzero(accZero); + + Index remaining_depth = (depth & -accRows); + Index k = 0; + for(; k + PEEL <= remaining_depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + for (int l = 0; l < PEEL; l++) { + MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); + } + } + 
for(; k < remaining_depth; k++) + { + MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); + } + for(; k < depth; k++) + { + Packet rhsV[1]; + rhsV[0] = pset1(rhs_ptr[0]); + pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + lhs_ptr += remaining_rows; + rhs_ptr += remaining_cols; + } + + accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); + for(Index i = 0; i < remaining_rows; i++) { + res(row + i, col) += accZero.packet[0][i]; + } +} + +template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( + const Scalar* &lhs_ptr, + const Scalar* &rhs_ptr, + PacketBlock &accZero, + Index remaining_rows) +{ + Packet rhsV[4]; + pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; +} + +template +EIGEN_STRONG_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; + PacketBlock accZero, acc; + + bsetzero(accZero); + + Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); + Index k = 0; + for(; k + PEEL <= remaining_depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + for (int l = 0; l < PEEL; l++) { + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + } + } + for(; k < remaining_depth; k++) + { + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + } + + if ((remaining_depth == depth) && (rows >= accCols)) + { + for(Index j = 0; j < 4; j++) { + acc.packet[j] = res.template loadPacket(row, col + j); + } + bscale(acc, accZero, pAlpha, pMask); + res.template storePacketBlock(row, col, acc); + } else { + for(; k < depth; k++) + { + Packet rhsV[4]; + pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; + } + + for(Index j = 0; j < 4; j++) { + accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); + } + for(Index j = 0; j < 4; j++) { + for(Index i = 0; i < remaining_rows; i++) { + res(row + i, col + j) += accZero.packet[j][i]; + } + } + } +} + +#define MICRO_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_UNROLL_WORK(func, func2, peel) \ + MICRO_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ + func(4,peel) func(5,peel) func(6,peel) func(7,peel) + +#define MICRO_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + } + +#define MICRO_WORK_ONE(iter, peel) \ + if (unroll_factor > iter) { \ + pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + } + +#define MICRO_TYPE_PEEL4(func, func2, peel) \ + if (PEEL > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ + pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + MICRO_UNROLL_WORK(func, func2, peel) \ 
+ } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_TYPE_PEEL1(func, func2, peel) \ + if (PEEL > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ + rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ + MICRO_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + func(func1,func2,0); func(func1,func2,1); \ + func(func1,func2,2); func(func1,func2,3); \ + func(func1,func2,4); func(func1,func2,5); \ + func(func1,func2,6); func(func1,func2,7); \ + func(func1,func2,8); func(func1,func2,9); + +#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ + Packet rhsV0[M]; \ + func(func1,func2,0); + +#define MICRO_ONE_PEEL4 \ + MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += (accRows * PEEL); + +#define MICRO_ONE4 \ + MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += accRows; + +#define MICRO_ONE_PEEL1 \ + MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += (remaining_cols * PEEL); + +#define MICRO_ONE1 \ + MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ + rhs_ptr += remaining_cols; + +#define MICRO_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzero(accZero##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##iter); \ + } + +#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE) + +#define MICRO_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } + +#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE) + +#define MICRO_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ 
+ } + +#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE) + +#define MICRO_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ + acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ + acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ + acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, col, acc); \ + } + +#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) + +#define MICRO_COL_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, col, acc); \ + } + +#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_unrolled_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index col, + const Packet& pAlpha) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; + + MICRO_SRC_PTR + MICRO_DST_PTR + + Index k = 0; + for(; k + PEEL <= depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + MICRO_PREFETCH + MICRO_ONE_PEEL4 + } + for(; k < depth; k++) + { + MICRO_ONE4 + } + MICRO_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index col, + Index remaining_cols, + const Packet& pAlpha) +{ + const 
Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; + + MICRO_SRC_PTR + MICRO_DST_PTR + + Index k = 0; + for(; k + PEEL <= depth; k+= PEEL) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + MICRO_PREFETCH + MICRO_ONE_PEEL1 + } + for(; k < depth; k++) + { + MICRO_ONE1 + } + MICRO_COL_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlpha) +{ +#define MAX_UNROLL 6 + while(row + MAX_UNROLL*accCols <= rows) { + gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_UNROLL > 7 + case 7: + gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 6 + case 6: + gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 5 + case 5: + gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 4 + case 4: + gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 3 + case 3: + gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, 
accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 2 + case 2: + gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif +#if MAX_UNROLL > 1 + case 1: + gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_UNROLL +} + +/**************** + * GEMM kernels * + * **************/ +template +EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + Index row = 0; + +#define MAX_UNROLL 6 + while(row + MAX_UNROLL*accCols <= rows) { + gemm_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_UNROLL > 7 + case 7: + gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 6 + case 6: + gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 5 + case 5: + gemm_unrolled_iteration<5, Scalar, Packet, 
DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 4 + case 4: + gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 3 + case 3: + gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 2 + case 2: + gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_UNROLL > 1 + case 1: + gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + + if (remaining_rows > 0) + { + gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); + } + rhs_base++; + } + } +} + +#define accColsC (accCols / 2) +#define advanceRows ((LhsIsReal) ? 1 : 2) +#define advanceCols ((RhsIsReal) ? 1 : 2) + +// PEEL_COMPLEX loop factor. 
+#define PEEL_COMPLEX 3 + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( + const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, + const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, + PacketBlock &accReal, PacketBlock &accImag, + Index remaining_rows, + Index remaining_cols) +{ + Packet rhsV[1], rhsVi[1]; + rhsV[0] = pset1(rhs_ptr_real[0]); + if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); + pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + rhs_ptr_real += remaining_cols; + if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +} + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; + const Scalar* lhs_ptr_imag; + if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + PacketBlock accReal, accImag; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + + bsetzero(accReal); + bsetzero(accImag); + + Index remaining_depth = (depth & -accRows); + Index k = 0; + for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + 
EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + for (int l = 0; l < PEEL_COMPLEX; l++) { + MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); + } + } + for(; k < remaining_depth; k++) + { + MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); + } + + for(; k < depth; k++) + { + Packet rhsV[1], rhsVi[1]; + rhsV[0] = pset1(rhs_ptr_real[0]); + if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); + pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + rhs_ptr_real += remaining_cols; + if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + } + + bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); + + if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) + { + res(row + 0, col + 0) += pfirst(acc0.packet[0]); + } else { + acc0.packet[0] += res.template loadPacket(row + 0, col + 0); + res.template storePacketBlock(row + 0, col + 0, acc0); + if(remaining_rows > accColsC) { + res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); + } + } +} + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( + const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, + const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, + PacketBlock &accReal, PacketBlock &accImag, + Index remaining_rows) +{ + Packet rhsV[4], rhsVi[4]; + pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) 
lhs_ptr_imag += remaining_rows; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); +} + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; + const Scalar* lhs_ptr_imag; + if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); + PacketBlock accReal, accImag; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + + bsetzero(accReal); + bsetzero(accImag); + + Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); + Index k = 0; + for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + for (int l = 0; l < PEEL_COMPLEX; l++) { + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + } + } + for(; k < remaining_depth; k++) + { + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + } + + if ((remaining_depth == depth) && (rows >= accCols)) + { + bload(tRes, res, row, col); + bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + res.template storePacketBlock(row + 0, col, acc0); + res.template storePacketBlock(row + accColsC, col, acc1); + } else { + for(; k < depth; k++) + { + Packet rhsV[4], rhsVi[4]; + pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; + } + + bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); + + if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) + { + for(Index j = 0; j < 4; j++) { + res(row + 0, col + j) += pfirst(acc0.packet[j]); + } + } else { + for(Index j = 0; j < 4; j++) { + PacketBlock acc2; + acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; + res.template storePacketBlock(row + 0, col + j, acc2); + 
if(remaining_rows > accColsC) { + res(row + accColsC, col + j) += pfirst(acc1.packet[j]); + } + } + } + } +} + +#define MICRO_COMPLEX_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) + +#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + MICRO_COMPLEX_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) + +#define MICRO_COMPLEX_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ + lhs_ptr_real##iter += accCols; \ + if(!LhsIsReal) { \ + lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ + lhs_ptr_imag##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } + +#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ + if (unroll_factor > iter) { \ + pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + +#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ + if (unroll_factor > iter) { \ + pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + +#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ + if (PEEL_COMPLEX > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) { \ + pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ + if (PEEL_COMPLEX > peel) { \ + Packet 
lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ + if(!RhsIsReal) { \ + rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ + func(func1,func2,0); func(func1,func2,1); \ + func(func1,func2,2); func(func1,func2,3); \ + func(func1,func2,4); func(func1,func2,5); \ + func(func1,func2,6); func(func1,func2,7); \ + func(func1,func2,8); func(func1,func2,9); + +#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ + Packet rhsV0[M], rhsVi0[M];\ + func(func1,func2,0); + +#define MICRO_COMPLEX_ONE_PEEL4 \ + MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += (accRows * PEEL_COMPLEX); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX); + +#define MICRO_COMPLEX_ONE4 \ + MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += accRows; \ + if(!RhsIsReal) rhs_ptr_imag += accRows; + +#define MICRO_COMPLEX_ONE_PEEL1 \ + MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ + if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); + +#define MICRO_COMPLEX_ONE1 \ + MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ + rhs_ptr_real += 
remaining_cols; \ + if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + +#define MICRO_COMPLEX_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##iter); \ + EIGEN_UNUSED_VARIABLE(accImag##iter); \ + } + +#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE) + +#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ + if(!LhsIsReal) { \ + lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } + +#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) + +#define MICRO_COMPLEX_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ + if(!LhsIsReal) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ + } \ + } + +#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) + +#define MICRO_COMPLEX_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + bload(tRes, res, row + iter*accCols, col); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + } + +#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) + +#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + bload(tRes, res, row + iter*accCols, col); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + 
iter*accCols + 0, col, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + } + +#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index col, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock accReal4, accImag4; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + + MICRO_COMPLEX_SRC_PTR + MICRO_COMPLEX_DST_PTR + + Index k = 0; + for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + MICRO_COMPLEX_PREFETCH + MICRO_COMPLEX_ONE_PEEL4 + } + for(; k < depth; k++) + { + MICRO_COMPLEX_ONE4 + } + MICRO_COMPLEX_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index col, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) { 
+ rhs_ptr_imag = rhs_base + remaining_cols*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock accReal4, accImag4; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; + + MICRO_COMPLEX_SRC_PTR + MICRO_COMPLEX_DST_PTR + + Index k = 0; + for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + MICRO_COMPLEX_PREFETCH + MICRO_COMPLEX_ONE_PEEL1 + } + for(; k < depth; k++) + { + MICRO_COMPLEX_ONE1 + } + MICRO_COMPLEX_COL_STORE + + row += unroll_factor*accCols; +} + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_col_iteration<3, Scalar, 
Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_UNROLL +} + +template +EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlphaReal = pset1(alpha.real()); + const Packet pAlphaImag = pset1(alpha.imag()); + const Packet pMask = bmask((const int)(remaining_rows)); + + const Scalar* blockA = (Scalar *) blockAc; + const Scalar* blockB = (Scalar *) blockBc; + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + Index row = 0; + +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + } + switch( 
(rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); + + if (remaining_rows > 0) + { + gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); + } + 
rhs_base++; + } + } +} + +#undef accColsC +#undef advanceCols +#undef advanceRows + +/************************************ + * ppc64le template specializations * + * **********************************/ +template +struct gemm_pack_lhs +{ + void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs +{ + void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK +template +struct gemm_pack_rhs +{ + void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs +{ + void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} +#endif + +template +struct gemm_pack_lhs +{ + void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index 
offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs +{ + void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK +template +struct gemm_pack_rhs +{ + void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs +{ + void operator()(float* blockB, const DataMapper& rhs, 
Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_pack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} +#endif + +template +struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, 
Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockA, lhs, depth, rows, stride, offset); +} + +template +struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +template +struct gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> +{ + void operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> + ::operator()(std::complex* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + dhs_cpack pack; + pack(blockB, rhs, depth, cols, stride, offset); +} + +// ********* gebp specializations ********* +template +struct gebp_kernel +{ + typedef typename quad_traits::vectortype Packet; + typedef typename quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const float* blockA, const float* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel + ::operator()(const DataMapper& res, const float* blockA, const float* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index 
accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); + + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemmMMA; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemmMMA; + } + else{ + gemm_function = &Eigen::internal::gemm; + } + #else + gemm_function = &Eigen::internal::gemm; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef Packet4f Packet; + typedef Packet2cf Packetc; + typedef Packet4f RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + 
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef Packet4f Packet; + typedef Packet2cf Packetc; + typedef Packet4f RhsPacket; + + void operator()(const DataMapper& res, const float* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const float* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const float*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif 
defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef Packet4f Packet; + typedef Packet2cf Packetc; + typedef Packet4f RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const float* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const float* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const float*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, 
false, true>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel +{ + typedef typename quad_traits::vectortype Packet; + typedef typename quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const double* blockA, const double* blockB, + Index rows, Index depth, Index cols, double alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel + ::operator()(const DataMapper& res, const double* blockA, const double* blockB, + Index rows, Index depth, Index cols, double alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); + + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemmMMA; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemmMMA; + } + else{ + gemm_function = 
&Eigen::internal::gemm; + } + #else + gemm_function = &Eigen::internal::gemm; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef quad_traits::vectortype Packet; + typedef Packet1cd Packetc; + typedef quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, std::complex, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, 
false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef quad_traits::vectortype Packet; + typedef Packet1cd Packetc; + typedef quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const std::complex* blockA, const double* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const std::complex* blockA, const double* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const std::complex*, const double*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, 
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } + +template +struct gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> +{ + typedef quad_traits::vectortype Packet; + typedef Packet1cd Packetc; + typedef quad_traits::rhstype RhsPacket; + + void operator()(const DataMapper& res, const double* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> + ::operator()(const DataMapper& res, const double* blockA, const std::complex* blockB, + Index rows, Index depth, Index cols, std::complex alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const Index accRows = quad_traits::rows; + const Index accCols = quad_traits::size; + void (*gemm_function)(const DataMapper&, const double*, const std::complex*, + Index, Index, Index, std::complex, Index, Index, Index, Index); + #ifdef EIGEN_ALTIVEC_MMA_ONLY + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = 
&Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + } +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H diff --git a/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h new file mode 100644 index 00000000..33d54349 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -0,0 +1,221 @@ +//#define EIGEN_POWER_USE_PREFETCH // Use prefetching in gemm routines +#ifdef EIGEN_POWER_USE_PREFETCH +#define EIGEN_POWER_PREFETCH(p) prefetch(p) +#else +#define EIGEN_POWER_PREFETCH(p) +#endif + +namespace Eigen { + +namespace internal { + +template +EIGEN_STRONG_INLINE void gemm_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlpha); + +template +EIGEN_STRONG_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask); + +template +EIGEN_STRONG_INLINE void gemm_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlpha); + +template +EIGEN_ALWAYS_INLINE 
Packet bmask(const int remaining_rows); + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index remaining_rows, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag); + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask); + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index rows, + Index col, + Index remaining_cols, + const Packet& pAlphaReal, + const Packet& pAlphaImag); + +template +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); + +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); + +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); + +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); + +const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, + 16, 17, 18, 19, + 4, 5, 6, 7, + 20, 21, 22, 23}; + +const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, + 24, 25, 26, 27, + 12, 13, 14, 15, + 28, 29, 30, 31}; +//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 +const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 
2, 3, 4, 5, 6, 7, + 16, 17, 18, 19, 20, 21, 22, 23}; + +//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 +const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, + 24, 25, 26, 27, 28, 29, 30, 31}; + + +// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); +} + +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +{ + bcouple_common(taccReal, taccImag, acc1, acc2); + + acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); + acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); + acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); + acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + + acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); + acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); + acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); + acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void 
bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); +} + +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +{ + bcouple_common(taccReal, taccImag, acc1, acc2); + + acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); + + acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); +} + +template<> +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); + acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); + acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); + acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); +} + +template<> +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +{ + acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); + + acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); +} + +// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. 
+template +EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) +{ + return ploadu(rhs); +} + +} // end namespace internal +} // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h new file mode 100644 index 00000000..6540c6fa --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -0,0 +1,629 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com) +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H +#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H + +#pragma GCC target("cpu=power10") + +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#endif + +namespace Eigen { + +namespace internal { + +template +EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) +{ + __builtin_mma_xxsetaccz(acc); +} + +template +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +{ + PacketBlock result; + __builtin_mma_disassemble_acc(&result.packet, acc); + + PacketBlock tRes; + bload(tRes, data, i, j); + + bscale(tRes, result, alpha); + + data.template storePacketBlock(i, j, tRes); +} + +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +{ + PacketBlock resultReal, resultImag; + __builtin_mma_disassemble_acc(&resultReal.packet, accReal); + 
__builtin_mma_disassemble_acc(&resultImag.packet, accImag); + + PacketBlock tRes; + bload(tRes, data, i, j); + + PacketBlock taccReal, taccImag; + bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); + + PacketBlock acc1, acc2; + bcouple(taccReal, taccImag, tRes, acc1, acc2); + + data.template storePacketBlock(i + N*accColsC, j, acc1); + data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); +} + +// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +{ + if(NegativeAccumulate) + { + __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } else { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) +{ + __vector_pair* a0 = (__vector_pair *)(&a.packet[0]); + if(NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, *a0, (__vector unsigned char)b); + } else { + __builtin_mma_xvf64gerpp(acc, *a0, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) +{ + if(NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b); + } else { + __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) +{ + // Just for compilation +} + +template +EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) +{ + pgerMMA(accReal, rhsV, lhsV); + if(LhsIsReal) { + pgerMMA(accImag, rhsVi, lhsV); + } else { + if(!RhsIsReal) { + pgerMMA(accReal, rhsVi, lhsVi); + pgerMMA(accImag, 
rhsVi, lhsV); + } else { + EIGEN_UNUSED_VARIABLE(rhsVi); + } + pgerMMA(accImag, rhsV, lhsVi); + } +} + +// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. +template +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) +{ + rhsV = ploadRhs((const Scalar*)(rhs)); +} + +template<> +EIGEN_ALWAYS_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) +{ + rhsV.packet[0] = ploadRhs((const double *)((Packet2d *)rhs )); + rhsV.packet[1] = ploadRhs((const double *)(((Packet2d *)rhs) + 1)); +} + +template<> +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) +{ +#if EIGEN_COMP_LLVM + __builtin_vsx_assemble_pair(&rhsV, + (__vector unsigned char)(ploadRhs((const double *)(((Packet2d *)rhs) + 1))), + (__vector unsigned char)(ploadRhs((const double *)((Packet2d *)rhs )))); +#else + __asm__ ("lxvp %x0,%1" : "=wa" (rhsV) : "Y" (*rhs)); +#endif +} + +template<> +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) +{ + // Just for compilation +} + +// PEEL_MMA loop factor. 
+#define PEEL_MMA 7 + +#define MICRO_MMA_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_MMA_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + } + +#define MICRO_MMA_WORK_ONE(iter, type, peel) \ + if (unroll_factor > iter) { \ + pgerMMA(&accZero##iter, rhsV##peel, lhsV##iter); \ + } + +#define MICRO_MMA_TYPE_PEEL(func, func2, type, peel) \ + if (PEEL_MMA > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ + ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV##peel); \ + MICRO_MMA_UNROLL(func2); \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ + func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ + MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); + +#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ + type rhsV0; \ + MICRO_MMA_TYPE_PEEL(func,func2,type,0); + +#define MICRO_MMA_ONE_PEEL \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_MMA_UNROLL_TYPE_PEEL(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr += (accRows * PEEL_MMA); + +#define MICRO_MMA_ONE \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, 
MICRO_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_MMA_UNROLL_TYPE_ONE(MICRO_MMA_WORK_ONE, MICRO_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr += accRows; + +#define MICRO_MMA_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzeroMMA(&accZero##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##iter); \ + } + +#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE) + +#define MICRO_MMA_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ + } + +#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_MMA_SRC_PTR_ONE) + +#define MICRO_MMA_PREFETCH_ONE(iter) \ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr##iter); \ + } + +#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_MMA_PREFETCH_ONE) + +#define MICRO_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ + } + +#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index& row, + Index col, + const Packet& pAlpha) +{ + const Scalar* rhs_ptr = rhs_base; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; + __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + + MICRO_MMA_SRC_PTR + MICRO_MMA_DST_PTR + + Index k = 0; + for(; k + PEEL_MMA <= depth; k+= PEEL_MMA) + { + EIGEN_POWER_PREFETCH(rhs_ptr); + MICRO_MMA_PREFETCH + MICRO_MMA_ONE_PEEL + } + for(; k < depth; k++) + { + MICRO_MMA_ONE + } + MICRO_MMA_STORE + + row += unroll_factor*accCols; +} + +template +void gemmMMA(const DataMapper& res, const Scalar* blockA, 
const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + + Index row = 0; +#define MAX_MMA_UNROLL 7 + while(row + MAX_MMA_UNROLL*accCols <= rows) { + gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_MMA_UNROLL > 7 + case 7: + gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 6 + case 6: + gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 5 + case 5: + gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 4 + case 4: + gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 3 + case 3: + gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 2 + case 2: + gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, 
DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 1 + case 1: + gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + + if (remaining_rows > 0) + { + gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); + } + rhs_base++; + } + } +} + +#define accColsC (accCols / 2) +#define advanceRows ((LhsIsReal) ? 1 : 2) +#define advanceCols ((RhsIsReal) ? 1 : 2) + +// PEEL_COMPLEX_MMA loop factor. 
+#define PEEL_COMPLEX_MMA 7 + +#define MICRO_COMPLEX_MMA_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) + +#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ + lhs_ptr_real##iter += accCols; \ + if(!LhsIsReal) { \ + lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ + lhs_ptr_imag##iter += accCols; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ + EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ + } + +#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \ + if (unroll_factor > iter) { \ + pgercMMA(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + } + +#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ + if (PEEL_COMPLEX_MMA > peel) { \ + Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ + if(!RhsIsReal) { \ + ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } \ + MICRO_COMPLEX_MMA_UNROLL(func2); \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ + 
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); + +#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ + type rhsV0, rhsVi0; \ + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); + +#define MICRO_COMPLEX_MMA_ONE_PEEL \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr_real += (accRows * PEEL_COMPLEX_MMA); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_MMA); + +#define MICRO_COMPLEX_MMA_ONE \ + if (sizeof(Scalar) == sizeof(float)) { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, RhsPacket); \ + } else { \ + MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_MMA_LOAD_ONE, __vector_pair); \ + } \ + rhs_ptr_real += accRows; \ + if(!RhsIsReal) rhs_ptr_imag += accRows; + +#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + bsetzeroMMA(&accReal##iter); \ + bsetzeroMMA(&accImag##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##iter); \ + EIGEN_UNUSED_VARIABLE(accImag##iter); \ + } + +#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE) + +#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ + if (unroll_factor > iter) { \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ + if(!LhsIsReal) { \ + lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ + EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ + } + +#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) + +#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) 
\ + if (unroll_factor > iter) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ + if(!LhsIsReal) { \ + EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ + } \ + } + +#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) + +#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ + if (unroll_factor > iter) { \ + storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + } + +#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) + +template +EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index& row, + Index col, + const Packet& pAlphaReal, + const Packet& pAlphaImag) +{ + const Scalar* rhs_ptr_real = rhs_base; + const Scalar* rhs_ptr_imag; + if(!RhsIsReal) { + rhs_ptr_imag = rhs_base + accRows*strideB; + } else { + EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + } + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; + __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + + MICRO_COMPLEX_MMA_SRC_PTR + MICRO_COMPLEX_MMA_DST_PTR + + Index k = 0; + for(; k + PEEL_COMPLEX_MMA <= depth; k+= PEEL_COMPLEX_MMA) + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + MICRO_COMPLEX_MMA_PREFETCH + MICRO_COMPLEX_MMA_ONE_PEEL + } + for(; k < depth; k++) + { + MICRO_COMPLEX_MMA_ONE + } + MICRO_COMPLEX_MMA_STORE + + row += unroll_factor*accCols; +} + +template +void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, 
Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; + const Index remaining_cols = cols % accRows; + + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; + + const Packet pAlphaReal = pset1(alpha.real()); + const Packet pAlphaImag = pset1(alpha.imag()); + const Packet pMask = bmask((const int)(remaining_rows)); + + const Scalar* blockA = (Scalar *) blockAc; + const Scalar* blockB = (Scalar *) blockBc; + + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 1 + case 1: + gemm_complex_unrolled_MMA_iteration<1, 
Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } + } + + if(remaining_cols > 0) + { + const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; + const Scalar* lhs_base = blockA; + + for(; col < cols; col++) + { + Index row = 0; + + gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); + + if (remaining_rows > 0) + { + gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); + } + rhs_base++; + } + } +} + +#undef accColsC +#undef advanceRows +#undef advanceCols + +#pragma GCC reset_options +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H + diff --git a/externals/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h index b3f1ea19..2a440545 100644 --- a/externals/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/externals/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,31 +22,38 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -typedef __vector float Packet4f; -typedef __vector int Packet4i; -typedef __vector unsigned int 
Packet4ui; -typedef __vector __bool int Packet4bi; -typedef __vector short int Packet8i; -typedef __vector unsigned char Packet16uc; +typedef __vector float Packet4f; +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8s; +typedef __vector unsigned short int Packet8us; +typedef __vector signed char Packet16c; +typedef __vector unsigned char Packet16uc; +typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead - #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + Packet4f p4f_##NAME = {X, X, X, X} #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) +#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ + Packet4ui p4ui_##NAME = {X, X, X, X} + +#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ + Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X} + +#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ + Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X} + #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1(X) @@ -64,7 +71,7 @@ typedef __vector unsigned char Packet16uc; #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) - +#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits::type // These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} @@ -72,25 +79,36 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ 
-1, -1, -1, -1} +static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); +static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); +static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} +static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} #ifndef __VSX__ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} #endif -static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; -static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; +static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; + +static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; +static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; -static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; +static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 }; +static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; -// Mask alignment -#ifdef __PPC64__ -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#else -#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 -#endif +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; +static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 }; +static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 }; +static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 }; +static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 }; -#define 
_EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) +static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 }; // Handle endianness properly while loading constants // Define global static constants: @@ -103,7 +121,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; #else -static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; @@ -129,27 +147,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #endif -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 1, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 
EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, #ifdef __VSX__ HasSqrt = 1, #if !EIGEN_COMP_CLANG @@ -160,16 +178,62 @@ template<> struct packet_traits : default_packet_traits #else HasSqrt = 0, HasRsqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, #endif HasRound = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasNegate = 1, HasBlend = 1 }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { + typedef Packet8bf type; + typedef Packet8bf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, +#ifdef __VSX__ + HasSqrt = 1, +#if !EIGEN_COMP_CLANG + HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif +#else + HasSqrt = 0, + HasRsqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, +#endif + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { typedef Packet4i type; typedef Packet4i half; enum { @@ -178,6 +242,79 @@ template<> struct packet_traits : default_packet_traits size = 4, HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8s type; + typedef Packet8s half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8us type; + typedef Packet8us half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + 
HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16c type; + typedef Packet16c half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16uc type; + typedef Packet16uc half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 1, HasSub = 1, HasMul = 1, @@ -186,9 +323,62 @@ template<> struct packet_traits : default_packet_traits }; }; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef short int type; + typedef Packet8s half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef unsigned short int type; + typedef Packet8us half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; + +template<> struct unpacket_traits +{ + typedef signed char type; + typedef Packet16c half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits +{ + typedef unsigned char type; + typedef Packet16uc half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; -template<> struct unpacket_traits { typedef float type; enum 
{size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits +{ + typedef bfloat16 type; + typedef Packet8bf half; + enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +inline std::ostream & operator <<(std::ostream & s, const Packet16c & v) +{ + union { + Packet16c v; + signed char n[16]; + } vt; + vt.v = v; + for (int i=0; i< 16; i++) + s << vt.n[i] << ", "; + return s; +} inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) { @@ -198,7 +388,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) } vt; vt.v = v; for (int i=0; i< 16; i++) - s << (int)vt.n[i] << ", "; + s << vt.n[i] << ", "; return s; } @@ -235,146 +425,395 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) return s; } -// Need to define them first or we get specialization after instantiation errors -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +template +EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. 
+ EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_vsx_ld(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); #endif } +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + return pload_common(from); +} + template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return pload_common(from); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +template<> EIGEN_STRONG_INLINE Packet8s pload(const short int* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet8us pload(const unsigned short int* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet16c pload(const signed char* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet16uc pload(const unsigned char* from) +{ + return pload_common(from); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pload(const bfloat16* from) { + return pload_common(reinterpret_cast(from)); +} + +template +EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. 
+ EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE #ifdef __VSX__ - vec_vsx_st(from, 0, to); + vec_xst(from, 0, to); #else vec_st(from, 0, to); #endif } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + pstore_common(to, from); +} + template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + pstore_common(to, from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - Packet4f v = {from, from, from, from}; +template<> EIGEN_STRONG_INLINE void pstore(short int* to, const Packet8s& from) +{ + pstore_common(to, from); +} + +template<> EIGEN_STRONG_INLINE void pstore(unsigned short int* to, const Packet8us& from) +{ + pstore_common(to, from); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet8bf& from) +{ + pstore_common(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstore(signed char* to, const Packet16c& from) +{ + pstore_common(to, from); +} + +template<> EIGEN_STRONG_INLINE void pstore(unsigned char* to, const Packet16uc& from) +{ + pstore_common(to, from); +} + +template +EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from}; return v; } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - Packet4i v = {from, from, from, from}; +template +EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from, from, from, from, from}; return v; } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) + +template +EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; + return v; +} + +template<> EIGEN_STRONG_INLINE 
Packet4f pset1(const float& from) { + return pset1_size4(from); +} + +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { + return pset1_size4(from); +} + +template<> EIGEN_STRONG_INLINE Packet8s pset1(const short int& from) { + return pset1_size8(from); +} + +template<> EIGEN_STRONG_INLINE Packet8us pset1(const unsigned short int& from) { + return pset1_size8(from); +} + +template<> EIGEN_STRONG_INLINE Packet16c pset1(const signed char& from) { + return pset1_size16(from); +} + +template<> EIGEN_STRONG_INLINE Packet16uc pset1(const unsigned char& from) { + return pset1_size16(from); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { + return reinterpret_cast(pset1(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pset1(const bfloat16& from) { + return pset1_size8(reinterpret_cast(from)); +} + +template EIGEN_STRONG_INLINE void +pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - a3 = pload(a); + a3 = pload(a); a0 = vec_splat(a3, 0); a1 = vec_splat(a3, 1); a2 = vec_splat(a3, 2); a3 = vec_splat(a3, 3); } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcast4_common(a, a0, a1, a2, a3); +} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) { - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); + pbroadcast4_common(a, a0, a1, a2, a3); +} + +template EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + return pload(a); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; - af[0] = from[0*stride]; 
- af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); + return pgather_common(from, stride); } + template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); + return pgather_common(from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) + +template EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) { - float EIGEN_ALIGN16 af[4]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + a[4] = from[4*stride]; + a[5] = from[5*stride]; + a[6] = from[6*stride]; + a[7] = from[7*stride]; + return pload(a); +} + +template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const short int* from, Index stride) +{ + return pgather_size8(from, stride); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) + +template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const unsigned short int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; - pstore((int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; + return pgather_size8(from, stride); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } +template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather(const bfloat16* from, Index stride) +{ + return pgather_size8(from, stride); +} -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return a + 
b; } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return a + b; } +template EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + a[4] = from[4*stride]; + a[5] = from[5*stride]; + a[6] = from[6*stride]; + a[7] = from[7*stride]; + a[8] = from[8*stride]; + a[9] = from[9*stride]; + a[10] = from[10*stride]; + a[11] = from[11*stride]; + a[12] = from[12*stride]; + a[13] = from[13*stride]; + a[14] = from[14*stride]; + a[15] = from[15*stride]; + return pload(a); +} + + +template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const signed char* from, Index stride) +{ + return pgather_size16(from, stride); +} -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return a - b; } +template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const unsigned char* from, Index stride) +{ + return pgather_size16(from, stride); +} -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; +} -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + pscatter_size4(to, from, stride); +} -template<> 
EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return a * b; } +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + pscatter_size4(to, from, stride); +} -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +template EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) { -#ifndef __VSX__ // VSX actually provides a div instruction - Packet4f t, y_0, y_1; + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; + to[4*stride] = a[4]; + to[5*stride] = a[5]; + to[6*stride] = a[6]; + to[7*stride] = a[7]; +} - // Altivec does not offer a divide instruction, we have to do a reciprocal approximation - y_0 = vec_re(b); - // Do one Newton-Raphson iteration to get the needed accuracy - t = vec_nmsub(y_0, b, p4f_ONE); - y_1 = vec_madd(y_0, t, y_0); +template<> EIGEN_DEVICE_FUNC inline void pscatter(short int* to, const Packet8s& from, Index stride) +{ + pscatter_size8(to, from, stride); +} - return vec_madd(a, y_1, p4f_MZERO); -#else +template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned short int* to, const Packet8us& from, Index stride) +{ + pscatter_size8(to, from, stride); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(bfloat16* to, const Packet8bf& from, Index stride) +{ + pscatter_size8(to, from, stride); +} + +template EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; + to[4*stride] = a[4]; + to[5*stride] = a[5]; + 
to[6*stride] = a[6]; + to[7*stride] = a[7]; + to[8*stride] = a[8]; + to[9*stride] = a[9]; + to[10*stride] = a[10]; + to[11*stride] = a[11]; + to[12*stride] = a[12]; + to[13*stride] = a[13]; + to[14*stride] = a[14]; + to[15*stride] = a[15]; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(signed char* to, const Packet16c& from, Index stride) +{ + pscatter_size16(to, from, stride); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned char* to, const Packet16uc& from, Index stride) +{ + pscatter_size16(to, from, stride); +} + +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet8s plset(const short int& a) { return pset1(a) + p8s_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet8us plset(const unsigned short int& a) { return pset1(a) + p8us_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet16c plset(const signed char& a) { return pset1(a) + p16c_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet16uc plset(const unsigned char& a) { return pset1(a) + p16uc_COUNTDOWN; } + +template<> EIGEN_STRONG_INLINE Packet4f padd (const Packet4f& a, const Packet4f& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4i padd (const Packet4i& a, const Packet4i& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4ui padd (const Packet4ui& a, const Packet4ui& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet8s padd (const Packet8s& a, const Packet8s& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet8us padd (const Packet8us& a, const Packet8us& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet16c padd (const Packet16c& a, const Packet16c& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return a + b; } + +template<> EIGEN_STRONG_INLINE Packet4f psub (const Packet4f& a, const 
Packet4f& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet4i psub (const Packet4i& a, const Packet4i& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet8s psub (const Packet8s& a, const Packet8s& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, const Packet8us& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } + +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4f pmul (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } +template<> EIGEN_STRONG_INLINE Packet4i pmul (const Packet4i& a, const Packet4i& b) { return a * b; } +template<> EIGEN_STRONG_INLINE Packet8s pmul (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmul (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmul (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ +#ifndef __VSX__ // VSX actually provides a div instruction + Packet4f t, y_0, y_1; + + // Altivec does not offer a divide instruction, we have to do a reciprocal approximation + y_0 = vec_re(b); + + // Do one Newton-Raphson iteration to get the needed accuracy + t = vec_nmsub(y_0, b, p4f_ONE); + y_1 = vec_madd(y_0, t, 
y_0); + + return vec_madd(a, y_1, p4f_MZERO); +#else return vec_div(a, b); #endif } @@ -387,85 +826,247 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } +template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN + Packet4f ret; + __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_min(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet16c pmin(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const 
Packet4f& b) +{ + #ifdef __VSX__ + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN + Packet4f ret; + __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_max(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { + Packet4f c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s 
pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const Packet8bf& b) { + return pand(a, 
b); +} + template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet8us por(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) { + return por(a, b); +} template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) { + return pxor(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return vec_sel(b, a, reinterpret_cast(mask)); +} -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); + Packet4f res; -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } +#ifdef __VSX__ + __asm__("xvrspiz %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (t)); +#else + __asm__("vrfiz %0, %1\n\t" + : "=v" (res) + : "v" (t)); +#endif + + return res; +} template<> 
EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) +{ + Packet4f res; -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) + __asm__("xvrspic %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (a)); + + return res; +} + +template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) { EIGEN_DEBUG_ALIGNED_LOAD +#ifdef _BIG_ENDIAN Packet16uc MSQ, LSQ; Packet16uc mask; MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword mask = vec_lvsl(0, from); // create the permute mask - return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data + //TODO: Add static_cast here + return (Packet) vec_perm(MSQ, LSQ, mask); // align the data +#else + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + return ploadu_common(from); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data + return ploadu_common(from); } -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet8s ploadu(const short int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) 
_EIGEN_ALIGNED_PTR(from)); + return ploadu_common(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet8us ploadu(const unsigned short int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); + return ploadu_common(from); +} +template<> EIGEN_STRONG_INLINE Packet8bf ploadu(const bfloat16* from) +{ + return ploadu_common(reinterpret_cast(from)); +} +template<> EIGEN_STRONG_INLINE Packet16c ploadu(const signed char* from) +{ + return ploadu_common(from); +} +template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const unsigned char* from) +{ + return ploadu_common(from); } -#endif -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +template EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) { - Packet4f p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); + Packet p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + return ploaddup_common(from); +} template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { - Packet4i p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); + return ploaddup_common(from); } -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const short int* from) +{ + Packet8s p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const unsigned short int* from) +{ + Packet8us p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, 
p16uc_DUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const short int* from) +{ + Packet8s p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8us ploadquad(const unsigned short int* from) +{ + Packet8us p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploadquad(const bfloat16* from) +{ + return ploadquad(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const signed char* from) +{ + Packet16c p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE8_HI); +} + +template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const unsigned char* from) +{ + Packet16uc p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE8_HI); +} + +template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) { EIGEN_DEBUG_UNALIGNED_STORE +#ifdef _BIG_ENDIAN // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! 
Packet16uc MSQ, LSQ, edges; @@ -479,45 +1080,69 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& f MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second +#else + vec_xst(from, 0, to); +#endif +} +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + pstoreu_common(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + pstoreu_common(to, from); } -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) +template<> EIGEN_STRONG_INLINE void pstoreu(short int* to, const Packet8s& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); + pstoreu_common(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) 
+template<> EIGEN_STRONG_INLINE void pstoreu(unsigned short int* to, const Packet8us& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + pstoreu_common(to, from); +} +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet8bf& from) +{ + pstoreu_common(reinterpret_cast(to), from); +} +template<> EIGEN_STRONG_INLINE void pstoreu(signed char* to, const Packet16c& from) +{ + pstoreu_common(to, from); +} +template<> EIGEN_STRONG_INLINE void pstoreu(unsigned char* to, const Packet16uc& from) +{ + pstoreu_common(to, from); } -#endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } + +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) { + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x; + vec_ste(a, 0, &x); + return x; +} + +template<> EIGEN_STRONG_INLINE short int pfirst(const Packet8s& a) { + return pfirst_common(a); +} + +template<> EIGEN_STRONG_INLINE unsigned short int pfirst(const Packet8us& a) { + return pfirst_common(a); +} + +template<> EIGEN_STRONG_INLINE signed char pfirst(const Packet16c& a) +{ + return pfirst_common(a); +} + +template<> EIGEN_STRONG_INLINE unsigned char pfirst(const Packet16uc& a) +{ + return pfirst_common(a); +} template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -525,10 +1150,296 @@ template<> EIGEN_STRONG_INLINE Packet4f 
preverse(const Packet4f& a) } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} +template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE16)); +} +template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE16)); +} +template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) +{ + return vec_perm(a, a, p16uc_REVERSE8); +} +template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) +{ + return vec_perm(a, a, p16uc_REVERSE8); +} +template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) +{ + return preverse(a); +} template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { + _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF); + return pand(p8us_abs_mask, a); +} + +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) +{ return vec_sra(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) +{ return vec_sr(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) +{ 
return vec_sl(a,reinterpret_cast(pset1(N))); } +template EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + Packet4ui r = vec_sl(reinterpret_cast(a), p4ui_mask); + return reinterpret_cast(r); +} + +template EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + Packet4ui r = vec_sr(reinterpret_cast(a), p4ui_mask); + return reinterpret_cast(r); +} + +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + return vec_sr(a, p4ui_mask); +} + +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); + return vec_sl(a, p4ui_mask); +} + +template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); + return vec_sl(a, p8us_mask); +} +template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) +{ + const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); + return vec_sr(a, p8us_mask); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){ + return plogical_shift_left<16>(reinterpret_cast(bf.m_val)); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){ + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); + return pand( + reinterpret_cast(bf.m_val), + reinterpret_cast(p4ui_high_mask) + ); +} + +// Simple interleaving of bool masks, prevents true values from being +// converted to NaNs. 
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) { + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); + Packet4f bf_odd, bf_even; + bf_odd = pand(reinterpret_cast(p4ui_high_mask), odd); + bf_even = plogical_shift_right<16>(even); + return reinterpret_cast(por(bf_even, bf_odd)); +} + +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ + Packet4ui input = reinterpret_cast(p4f); + Packet4ui lsb = plogical_shift_right<16>(input); + lsb = pand(lsb, reinterpret_cast(p4i_ONE)); + + _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu); + Packet4ui rounding_bias = padd(lsb, p4ui_BIAS); + input = padd(input, rounding_bias); + + //Test NaN and Subnormal - Begin + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000); + Packet4ui exp = pand(p4ui_exp_mask, reinterpret_cast(p4f)); + + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF); + Packet4ui mantissa = pand(p4ui_mantissa_mask, reinterpret_cast(p4f)); + + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000); + Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); + Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast(p4i_ZERO)); + + Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast(p4i_ZERO)); + Packet4ui nan_selector = pandnot( + reinterpret_cast(is_max_exp), + reinterpret_cast(is_mant_zero) + ); + + Packet4ui subnormal_selector = pandnot( + reinterpret_cast(is_zero_exp), + reinterpret_cast(is_mant_zero) + ); + + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); + input = vec_sel(input, p4ui_nan, nan_selector); + input = vec_sel(input, reinterpret_cast(p4f), subnormal_selector); + //Test NaN and Subnormal - End + + input = plogical_shift_right<16>(input); + return reinterpret_cast(input); +} + +EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){ + Packet4f bf_odd, bf_even; + bf_odd = reinterpret_cast(F32ToBf16(odd).m_val); + bf_odd = plogical_shift_left<16>(bf_odd); + bf_even = 
reinterpret_cast(F32ToBf16(even).m_val); + return reinterpret_cast(por(bf_even, bf_odd)); +} +#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \ + Packet4f a_even = Bf16ToF32Even(A);\ + Packet4f a_odd = Bf16ToF32Odd(A);\ + Packet4f op_even = OP(a_even);\ + Packet4f op_odd = OP(a_odd);\ + return F32ToBf16(op_even, op_odd);\ + +#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \ + Packet4f a_even = Bf16ToF32Even(A);\ + Packet4f a_odd = Bf16ToF32Odd(A);\ + Packet4f b_even = Bf16ToF32Even(B);\ + Packet4f b_odd = Bf16ToF32Odd(B);\ + Packet4f op_even = OP(a_even, b_even);\ + Packet4f op_odd = OP(a_odd, b_odd);\ + return F32ToBf16(op_even, op_odd);\ + +#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \ + Packet4f a_even = Bf16ToF32Even(A);\ + Packet4f a_odd = Bf16ToF32Odd(A);\ + Packet4f b_even = Bf16ToF32Even(B);\ + Packet4f b_odd = Bf16ToF32Odd(B);\ + Packet4f op_even = OP(a_even, b_even);\ + Packet4f op_odd = OP(a_odd, b_odd);\ + return F32ToBf16Bool(op_even, op_odd);\ + +template<> EIGEN_STRONG_INLINE Packet8bf padd(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(padd, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmul(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pmul, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pdiv(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pdiv, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) { + BF16_TO_F32_UNARY_OP_WRAPPER(pnegate, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + 
BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_generic(a,exponent); +} +template<> EIGEN_STRONG_INLINE Packet8bf pldexp (const Packet8bf& a, const Packet8bf& exponent){ + BF16_TO_F32_BINARY_OP_WRAPPER(pldexp, a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_generic(a,exponent); +} +template<> EIGEN_STRONG_INLINE Packet8bf pfrexp (const Packet8bf& a, Packet8bf& e){ + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f e_even; + Packet4f e_odd; + Packet4f op_even = pfrexp(a_even, e_even); + Packet4f op_odd = pfrexp(a_odd, e_odd); + e = F32ToBf16(e_even, e_odd); + return F32ToBf16(op_even, op_odd); +} + +template<> EIGEN_STRONG_INLINE Packet8bf psin (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcos (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf plog (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pfloor (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pfloor, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pceil (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pceil, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pround (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pround, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf print (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(print, a); +} +template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { + Packet4f a_even = Bf16ToF32Even(a); + Packet4f a_odd = Bf16ToF32Odd(a); + Packet4f b_even = Bf16ToF32Even(b); + Packet4f b_odd = Bf16ToF32Odd(b); + Packet4f c_even = Bf16ToF32Even(c); + Packet4f c_odd = Bf16ToF32Odd(c); + Packet4f 
pmadd_even = pmadd(a_even, b_even, c_even); + Packet4f pmadd_odd = pmadd(a_odd, b_odd, c_odd); + return F32ToBf16(pmadd_even, pmadd_odd); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmin(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pmin, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER(pmax, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt, a, b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan, a, b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le, a, b); +} +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq, a, b); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) { + return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst(a))); +} + +template<> EIGEN_STRONG_INLINE Packet8bf ploaddup(const bfloat16* from) +{ + return ploaddup(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet8bf plset(const bfloat16& a) { + bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3), + bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) }; + return padd(pset1(a), pload(countdown)); +} template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { @@ -540,181 +1451,389 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4i sum; + sum = vec_sums(a, p4i_ZERO); +#ifdef _BIG_ENDIAN + sum = vec_sld(sum, p4i_ZERO, 12); +#else + sum = vec_sld(p4i_ZERO, sum, 
4); +#endif + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) +{ + float redux_even = predux(Bf16ToF32Even(a)); + float redux_odd = predux(Bf16ToF32Odd(a)); + float f32_result = redux_even + redux_odd; + return bfloat16(f32_result); +} +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) +{ + union{ + Packet v; + __UNPACK_TYPE__(Packet) n[8]; + } vt; + vt.v = a; + + EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; + EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; + Packet4i first_half = pload(first_loader); + Packet4i second_half = pload(second_loader); + + return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half)); +} + +template<> EIGEN_STRONG_INLINE short int predux(const Packet8s& a) +{ + return predux_size8(a); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux(const Packet8us& a) +{ + return predux_size8(a); +} + +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) +{ + union{ + Packet v; + __UNPACK_TYPE__(Packet) n[16]; + } vt; + vt.v = a; + + EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; + EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; + EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; + EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; + + Packet4i first_quarter = pload(first_loader); + Packet4i second_quarter = pload(second_loader); + Packet4i third_quarter = pload(third_loader); + Packet4i fourth_quarter = pload(fourth_loader); + + return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + + predux(third_quarter) + predux(fourth_quarter)); +} + +template<> EIGEN_STRONG_INLINE signed char predux(const Packet16c& a) +{ + return predux_size16(a); +} + +template<> EIGEN_STRONG_INLINE unsigned char 
predux(const Packet16uc& a) +{ + return predux_size16(a); +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); +} + +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} + +template<> EIGEN_STRONG_INLINE short int predux_mul(const Packet8s& a) +{ + Packet8s pair, quad, octo; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux_mul(const Packet8us& a) +{ + Packet8us pair, quad, octo; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) +{ + float redux_even = predux_mul(Bf16ToF32Even(a)); + float redux_odd = predux_mul(Bf16ToF32Odd(a)); + float f32_result = redux_even * redux_odd; + return bfloat16(f32_result); +} + + +template<> EIGEN_STRONG_INLINE signed char predux_mul(const Packet16c& a) +{ + Packet16c pair, quad, octo, result; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + result = vec_mul(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} + +template<> EIGEN_STRONG_INLINE unsigned char predux_mul(const Packet16uc& a) +{ + Packet16uc pair, quad, octo, result; + + pair = vec_mul(a, vec_sld(a, a, 8)); + quad = vec_mul(pair, vec_sld(pair, pair, 4)); + octo = vec_mul(quad, vec_sld(quad, quad, 2)); + result = vec_mul(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} + +// min +template EIGEN_STRONG_INLINE +__UNPACK_TYPE__(Packet) 
predux_min4(const Packet& a) +{ + Packet b, res; + b = vec_min(a, vec_sld(a, a, 8)); + res = vec_min(b, vec_sld(b, b, 4)); + return pfirst(res); +} + + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + return predux_min4(a); +} + +template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +{ + return predux_min4(a); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) +{ + float redux_even = predux_min(Bf16ToF32Even(a)); + float redux_odd = predux_min(Bf16ToF32Odd(a)); + float f32_result = (std::min)(redux_even, redux_odd); + return bfloat16(f32_result); +} + +template<> EIGEN_STRONG_INLINE short int predux_min(const Packet8s& a) +{ + Packet8s pair, quad, octo; + + //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } + pair = vec_min(a, vec_sld(a, a, 8)); + + //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } + quad = vec_min(pair, vec_sld(pair, pair, 4)); + + //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_min(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux_min(const Packet8us& a) +{ + Packet8us pair, quad, octo; + + //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } + pair = vec_min(a, vec_sld(a, a, 8)); + + //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } + quad = vec_min(pair, vec_sld(pair, pair, 4)); + + //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_min(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE signed char predux_min(const Packet16c& a) +{ + Packet16c pair, quad, octo, result; + + pair = vec_min(a, vec_sld(a, a, 8)); + quad = vec_min(pair, vec_sld(pair, pair, 4)); + octo = vec_min(quad, vec_sld(quad, quad, 2)); + result = vec_min(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} + +template<> EIGEN_STRONG_INLINE unsigned char predux_min(const Packet16uc& a) +{ + Packet16uc pair, quad, octo, result; + + pair = vec_min(a, 
vec_sld(a, a, 8)); + quad = vec_min(pair, vec_sld(pair, pair, 4)); + octo = vec_min(quad, vec_sld(quad, quad, 2)); + result = vec_min(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); +} +// max +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) +{ + Packet b, res; + b = vec_max(a, vec_sld(a, a, 8)); + res = vec_max(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + return predux_max4(a); +} + +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ + return predux_max4(a); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) +{ + float redux_even = predux_max(Bf16ToF32Even(a)); + float redux_odd = predux_max(Bf16ToF32Odd(a)); + float f32_result = (std::max)(redux_even, redux_odd); + return bfloat16(f32_result); +} + +template<> EIGEN_STRONG_INLINE short int predux_max(const Packet8s& a) +{ + Packet8s pair, quad, octo; + + //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } + pair = vec_max(a, vec_sld(a, a, 8)); + + //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } + quad = vec_max(pair, vec_sld(pair, pair, 4)); + + //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_max(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); +} + +template<> EIGEN_STRONG_INLINE unsigned short int predux_max(const Packet8us& a) { - Packet4f v[4], sum[4]; + Packet8us pair, quad, octo; + + //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } + pair = vec_max(a, vec_sld(a, a, 8)); - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], 
v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); + //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } + quad = vec_max(pair, vec_sld(pair, pair, 4)); - // Now do the summation: - // Lines 0+1 - sum[0] = sum[0] + sum[1]; - // Lines 2+3 - sum[1] = sum[2] + sum[3]; - // Add the results - sum[0] = sum[0] + sum[1]; - - return sum[0]; + //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } + octo = vec_max(quad, vec_sld(quad, quad, 2)); + return pfirst(octo); } -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +template<> EIGEN_STRONG_INLINE signed char predux_max(const Packet16c& a) { - Packet4i sum; - sum = vec_sums(a, p4i_ZERO); -#ifdef _BIG_ENDIAN - sum = vec_sld(sum, p4i_ZERO, 12); -#else - sum = vec_sld(p4i_ZERO, sum, 4); -#endif - return pfirst(sum); + Packet16c pair, quad, octo, result; + + pair = vec_max(a, vec_sld(a, a, 8)); + quad = vec_max(pair, vec_sld(pair, pair, 4)); + octo = vec_max(quad, vec_sld(quad, quad, 2)); + result = vec_max(octo, vec_sld(octo, octo, 1)); + + return pfirst(result); } -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +template<> EIGEN_STRONG_INLINE unsigned char predux_max(const Packet16uc& a) { - Packet4i v[4], sum[4]; + Packet16uc pair, quad, octo, result; - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); + pair = vec_max(a, vec_sld(a, a, 8)); + quad = vec_max(pair, vec_sld(pair, pair, 4)); + octo = vec_max(quad, vec_sld(quad, quad, 2)); + result = vec_max(octo, vec_sld(octo, octo, 1)); - // Now do the summation: 
- // Lines 0+1 - sum[0] = sum[0] + sum[1]; - // Lines 2+3 - sum[1] = sum[2] + sum[3]; - // Add the results - sum[0] = sum[0] + sum[1]; - - return sum[0]; + return pfirst(result); } -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { - Packet4f prod; - prod = pmul(a, vec_sld(a, a, 8)); - return pfirst(pmul(prod, vec_sld(prod, prod, 4))); + return vec_any_ne(x, pzero(x)); } -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) -{ - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - return aux[0] * aux[1] * aux[2] * aux[3]; +template EIGEN_DEVICE_FUNC inline void +ptranpose_common(PacketBlock& kernel){ + T t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); } -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet4f b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranpose_common(kernel); } -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) -{ - Packet4i b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + ptranpose_common(kernel); } -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet4f b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8s t0, t1, t2, 
t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); } -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) -{ - Packet4i b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8us t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { -#ifdef _BIG_ENDIAN - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } -#else - switch (Offset % 4) { - case 1: - first = vec_sld(second, first, 
12); break; - case 2: - first = vec_sld(second, first, 8); break; - case 3: - first = vec_sld(second, first, 4); break; - } -#endif - } -}; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8us t0, t1, t2, t3; + + t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val); + t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val); + t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val); + t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0, t1, t2, t3; +ptranspose(PacketBlock& kernel) { + Packet16c t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); @@ -725,9 +1844,10 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } + EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4i t0, t1, t2, t3; +ptranspose(PacketBlock& kernel) { + Packet16uc t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); @@ -738,18 +1858,398 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8s v[8], sum[8]; + + v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); + v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); + v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); + v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); + v[4] = vec_mergeh(kernel.packet[2], 
kernel.packet[6]); + v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); + v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); + v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); + sum[0] = vec_mergeh(v[0], v[4]); + sum[1] = vec_mergel(v[0], v[4]); + sum[2] = vec_mergeh(v[1], v[5]); + sum[3] = vec_mergel(v[1], v[5]); + sum[4] = vec_mergeh(v[2], v[6]); + sum[5] = vec_mergel(v[2], v[6]); + sum[6] = vec_mergeh(v[3], v[7]); + sum[7] = vec_mergel(v[3], v[7]); + + kernel.packet[0] = vec_mergeh(sum[0], sum[4]); + kernel.packet[1] = vec_mergel(sum[0], sum[4]); + kernel.packet[2] = vec_mergeh(sum[1], sum[5]); + kernel.packet[3] = vec_mergel(sum[1], sum[5]); + kernel.packet[4] = vec_mergeh(sum[2], sum[6]); + kernel.packet[5] = vec_mergel(sum[2], sum[6]); + kernel.packet[6] = vec_mergeh(sum[3], sum[7]); + kernel.packet[7] = vec_mergel(sum[3], sum[7]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8us v[8], sum[8]; + + v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); + v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); + v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); + v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); + v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); + v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); + v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); + v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); + sum[0] = vec_mergeh(v[0], v[4]); + sum[1] = vec_mergel(v[0], v[4]); + sum[2] = vec_mergeh(v[1], v[5]); + sum[3] = vec_mergel(v[1], v[5]); + sum[4] = vec_mergeh(v[2], v[6]); + sum[5] = vec_mergel(v[2], v[6]); + sum[6] = vec_mergeh(v[3], v[7]); + sum[7] = vec_mergel(v[3], v[7]); + + kernel.packet[0] = vec_mergeh(sum[0], sum[4]); + kernel.packet[1] = vec_mergel(sum[0], sum[4]); + kernel.packet[2] = vec_mergeh(sum[1], sum[5]); + kernel.packet[3] = vec_mergel(sum[1], sum[5]); + kernel.packet[4] = vec_mergeh(sum[2], sum[6]); + kernel.packet[5] = vec_mergel(sum[2], sum[6]); + 
kernel.packet[6] = vec_mergeh(sum[3], sum[7]); + kernel.packet[7] = vec_mergel(sum[3], sum[7]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet8bf v[8], sum[8]; + + v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val); + v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val); + v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val); + v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val); + v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val); + v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val); + v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val); + v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val); + sum[0] = vec_mergeh(v[0].m_val, v[4].m_val); + sum[1] = vec_mergel(v[0].m_val, v[4].m_val); + sum[2] = vec_mergeh(v[1].m_val, v[5].m_val); + sum[3] = vec_mergel(v[1].m_val, v[5].m_val); + sum[4] = vec_mergeh(v[2].m_val, v[6].m_val); + sum[5] = vec_mergel(v[2].m_val, v[6].m_val); + sum[6] = vec_mergeh(v[3].m_val, v[7].m_val); + sum[7] = vec_mergel(v[3].m_val, v[7].m_val); + + kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val); + kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val); + kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val); + kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val); + kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val); + kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val); + kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val); + kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet16c step1[16], step2[16], step3[16]; + + step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); + step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); + step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); + step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); + step1[4] = 
vec_mergeh(kernel.packet[2], kernel.packet[10]); + step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); + step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); + step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); + step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); + step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); + step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); + step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); + step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); + step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); + step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); + step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); + + step2[0] = vec_mergeh(step1[0], step1[8]); + step2[1] = vec_mergel(step1[0], step1[8]); + step2[2] = vec_mergeh(step1[1], step1[9]); + step2[3] = vec_mergel(step1[1], step1[9]); + step2[4] = vec_mergeh(step1[2], step1[10]); + step2[5] = vec_mergel(step1[2], step1[10]); + step2[6] = vec_mergeh(step1[3], step1[11]); + step2[7] = vec_mergel(step1[3], step1[11]); + step2[8] = vec_mergeh(step1[4], step1[12]); + step2[9] = vec_mergel(step1[4], step1[12]); + step2[10] = vec_mergeh(step1[5], step1[13]); + step2[11] = vec_mergel(step1[5], step1[13]); + step2[12] = vec_mergeh(step1[6], step1[14]); + step2[13] = vec_mergel(step1[6], step1[14]); + step2[14] = vec_mergeh(step1[7], step1[15]); + step2[15] = vec_mergel(step1[7], step1[15]); + + step3[0] = vec_mergeh(step2[0], step2[8]); + step3[1] = vec_mergel(step2[0], step2[8]); + step3[2] = vec_mergeh(step2[1], step2[9]); + step3[3] = vec_mergel(step2[1], step2[9]); + step3[4] = vec_mergeh(step2[2], step2[10]); + step3[5] = vec_mergel(step2[2], step2[10]); + step3[6] = vec_mergeh(step2[3], step2[11]); + step3[7] = vec_mergel(step2[3], step2[11]); + step3[8] = vec_mergeh(step2[4], step2[12]); + step3[9] = vec_mergel(step2[4], step2[12]); + step3[10] = vec_mergeh(step2[5], step2[13]); + step3[11] = 
vec_mergel(step2[5], step2[13]); + step3[12] = vec_mergeh(step2[6], step2[14]); + step3[13] = vec_mergel(step2[6], step2[14]); + step3[14] = vec_mergeh(step2[7], step2[15]); + step3[15] = vec_mergel(step2[7], step2[15]); + + kernel.packet[0] = vec_mergeh(step3[0], step3[8]); + kernel.packet[1] = vec_mergel(step3[0], step3[8]); + kernel.packet[2] = vec_mergeh(step3[1], step3[9]); + kernel.packet[3] = vec_mergel(step3[1], step3[9]); + kernel.packet[4] = vec_mergeh(step3[2], step3[10]); + kernel.packet[5] = vec_mergel(step3[2], step3[10]); + kernel.packet[6] = vec_mergeh(step3[3], step3[11]); + kernel.packet[7] = vec_mergel(step3[3], step3[11]); + kernel.packet[8] = vec_mergeh(step3[4], step3[12]); + kernel.packet[9] = vec_mergel(step3[4], step3[12]); + kernel.packet[10] = vec_mergeh(step3[5], step3[13]); + kernel.packet[11] = vec_mergel(step3[5], step3[13]); + kernel.packet[12] = vec_mergeh(step3[6], step3[14]); + kernel.packet[13] = vec_mergel(step3[6], step3[14]); + kernel.packet[14] = vec_mergeh(step3[7], step3[15]); + kernel.packet[15] = vec_mergel(step3[7], step3[15]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet16uc step1[16], step2[16], step3[16]; + + step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); + step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); + step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); + step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); + step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); + step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); + step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); + step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); + step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); + step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); + step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); + step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); + step1[12] = vec_mergeh(kernel.packet[6], 
kernel.packet[14]); + step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); + step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); + step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); + + step2[0] = vec_mergeh(step1[0], step1[8]); + step2[1] = vec_mergel(step1[0], step1[8]); + step2[2] = vec_mergeh(step1[1], step1[9]); + step2[3] = vec_mergel(step1[1], step1[9]); + step2[4] = vec_mergeh(step1[2], step1[10]); + step2[5] = vec_mergel(step1[2], step1[10]); + step2[6] = vec_mergeh(step1[3], step1[11]); + step2[7] = vec_mergel(step1[3], step1[11]); + step2[8] = vec_mergeh(step1[4], step1[12]); + step2[9] = vec_mergel(step1[4], step1[12]); + step2[10] = vec_mergeh(step1[5], step1[13]); + step2[11] = vec_mergel(step1[5], step1[13]); + step2[12] = vec_mergeh(step1[6], step1[14]); + step2[13] = vec_mergel(step1[6], step1[14]); + step2[14] = vec_mergeh(step1[7], step1[15]); + step2[15] = vec_mergel(step1[7], step1[15]); + + step3[0] = vec_mergeh(step2[0], step2[8]); + step3[1] = vec_mergel(step2[0], step2[8]); + step3[2] = vec_mergeh(step2[1], step2[9]); + step3[3] = vec_mergel(step2[1], step2[9]); + step3[4] = vec_mergeh(step2[2], step2[10]); + step3[5] = vec_mergel(step2[2], step2[10]); + step3[6] = vec_mergeh(step2[3], step2[11]); + step3[7] = vec_mergel(step2[3], step2[11]); + step3[8] = vec_mergeh(step2[4], step2[12]); + step3[9] = vec_mergel(step2[4], step2[12]); + step3[10] = vec_mergeh(step2[5], step2[13]); + step3[11] = vec_mergel(step2[5], step2[13]); + step3[12] = vec_mergeh(step2[6], step2[14]); + step3[13] = vec_mergel(step2[6], step2[14]); + step3[14] = vec_mergeh(step2[7], step2[15]); + step3[15] = vec_mergel(step2[7], step2[15]); + + kernel.packet[0] = vec_mergeh(step3[0], step3[8]); + kernel.packet[1] = vec_mergel(step3[0], step3[8]); + kernel.packet[2] = vec_mergeh(step3[1], step3[9]); + kernel.packet[3] = vec_mergel(step3[1], step3[9]); + kernel.packet[4] = vec_mergeh(step3[2], step3[10]); + kernel.packet[5] = vec_mergel(step3[2], 
step3[10]); + kernel.packet[6] = vec_mergeh(step3[3], step3[11]); + kernel.packet[7] = vec_mergel(step3[3], step3[11]); + kernel.packet[8] = vec_mergeh(step3[4], step3[12]); + kernel.packet[9] = vec_mergel(step3[4], step3[12]); + kernel.packet[10] = vec_mergeh(step3[5], step3[13]); + kernel.packet[11] = vec_mergel(step3[5], step3[13]); + kernel.packet[12] = vec_mergeh(step3[6], step3[14]); + kernel.packet[13] = vec_mergel(step3[6], step3[14]); + kernel.packet[14] = vec_mergeh(step3[7], step3[15]); + kernel.packet[15] = vec_mergel(step3[7], step3[15]); +} + +template EIGEN_STRONG_INLINE +Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); return vec_sel(elsePacket, thenPacket, mask); } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + return pblend4(ifPacket, thenPacket, elsePacket); +} + template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return pblend4(ifPacket, thenPacket, elsePacket); +} + +template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { + Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; + Packet8us mask = reinterpret_cast(vec_cmpeq(select, p8us_ONE)); + Packet8s result = vec_sel(elsePacket, thenPacket, mask); + return result; +} + 
+template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) { + Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; + Packet8us mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p8us_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) { + return pblend(ifPacket, thenPacket, elsePacket); +} + +template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) { + Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], + ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], + ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; + + Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) { + Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], + ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], + ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], + ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; + + Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +template <> +struct type_casting_traits { + enum { + VectorizedCast 
= 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vec_cts(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return vec_ctu(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return vec_ctf(a,0); +} + +template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8bf& a) { + Packet4f float_even = Bf16ToF32Even(a); + Packet4f float_odd = Bf16ToF32Odd(a); + Packet4ui int_even = pcast(float_even); + Packet4ui int_odd = pcast(float_odd); + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + Packet4ui low_even = pand(int_even, p4ui_low_mask); + Packet4ui low_odd = pand(int_odd, p4ui_low_mask); + + //Check values that are bigger than USHRT_MAX (0xFFFF) + Packet4bi overflow_selector; + if(vec_any_gt(int_even, p4ui_low_mask)){ + overflow_selector = vec_cmpgt(int_even, p4ui_low_mask); + low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector); + } + if(vec_any_gt(int_odd, p4ui_low_mask)){ + overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask); + low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector); + } + + low_odd = plogical_shift_left<16>(low_odd); + + Packet4ui int_final = por(low_even, low_odd); + return reinterpret_cast(int_final); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pcast(const Packet8us& a) { + //short -> int -> float -> bfloat16 + const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); + Packet4ui int_cast = 
reinterpret_cast(a); + Packet4ui int_even = pand(int_cast, p4ui_low_mask); + Packet4ui int_odd = plogical_shift_right<16>(int_cast); + Packet4f float_even = pcast(int_even); + Packet4f float_odd = pcast(int_odd); + return F32ToBf16(float_even, float_odd); +} + + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return reinterpret_cast(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return reinterpret_cast(a); +} + + //---------- double ---------- #ifdef __VSX__ @@ -764,9 +2264,12 @@ typedef __vector __bool long Packet2bl; static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); -static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; +static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); -static Packet2d p2d_MZERO = { -0.0, -0.0 }; +static Packet2d p2d_MZERO = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -774,16 +2277,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_c static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ONE), reinterpret_cast(p2d_ZERO), 8)); #endif -template Packet2d vec_splat_dbl(Packet2d& a); - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) -{ - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_HI)); -} - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) +template Packet2d vec_splat_dbl(Packet2d& a) { - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_LO)); + return vec_splat(a, index); } template<> struct packet_traits : default_packet_traits @@ -812,12 +2308,13 @@ template<> struct 
packet_traits : default_packet_traits HasRound = 1, HasFloor = 1, HasCeil = 1, + HasRint = 1, HasNegate = 1, HasBlend = 1 }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { @@ -845,21 +2342,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return vec_xl(0, const_cast(from)); // cast needed by Clang } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { @@ -867,28 +2356,32 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return v; } +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(unsigned long from) { + Packet2l v = {static_cast(from), static_cast(from)}; + return reinterpret_cast(v); +} + template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { - a1 = pload(a); - a0 = vec_splat_dbl<0>(a1); - a1 = vec_splat_dbl<1>(a1); - a3 = pload(a+2); - a2 = vec_splat_dbl<0>(a3); - a3 = vec_splat_dbl<1>(a3); + //This way is faster than vec_splat (at least for doubles in Power 9) + a0 = pset1(a[0]); + a1 = pset1(a[1]); + a2 = pset1(a[2]); + a3 = pset1(a[3]); } template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = 
from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -910,9 +2403,29 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) +{ + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN + Packet2d ret; + __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + } + +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) +{ + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN + Packet2d ret; + __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; +} -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { + Packet2d c = reinterpret_cast(vec_cmpge(a,b)); + return vec_nor(c,c); +} 
template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } @@ -922,14 +2435,34 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + Packet2d t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p2ul_SIGN), p2ul_PREV0DOT5)), a); + Packet2d res; + + __asm__("xvrdpiz %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (t)); + + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) +{ + Packet2d res; + + __asm__("xvrdpic %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (a)); + + return res; +} template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD - return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl(0, const_cast(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) @@ -942,13 +2475,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + EIGEN_DEBUG_UNALIGNED_STORE + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { 
EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { @@ -956,6 +2489,177 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +// VSX support varies between different compilers and even different +// versions of the same compiler. For gcc version >= 4.9.3, we can use +// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use +// a slow version that works with older compilers. +// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles +// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 +template<> +inline Packet2l pcast(const Packet2d& x) { +#if EIGEN_GNUC_AT_LEAST(5, 4) || \ + (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) + return vec_cts(x, 0); // TODO: check clang version. +#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return l; +#endif +} + +template<> +inline Packet2d pcast(const Packet2l& x) { + unsigned long long tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2d d = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return d; +} + + +// Packet2l shifts. +// For POWER8 we simply use vec_sr/l. +// +// Things are more complicated for POWER7. There is actually a +// vec_xxsxdi intrinsic but it is not supported by some gcc versions. +// So we need to shift by N % 32 and rearrage bytes. +#ifdef __POWER8_VECTOR__ + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + const Packet2ul shift = { N, N }; + return vec_sl(a, shift); +} + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + const Packet2ul shift = { N, N }; + return vec_sr(a, shift); +} + +#else + +// Shifts [A, B, C, D] to [B, 0, D, 0]. +// Used to implement left shifts for Packet2l. 
+EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) { + static const Packet16uc perm = { + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; + #ifdef _BIG_ENDIAN + return vec_perm(p4i_ZERO, a, perm); + #else + return vec_perm(a, p4i_ZERO, perm); + #endif +} + +// Shifts [A, B, C, D] to [0, A, 0, C]. +// Used to implement right shifts for Packet2l. +EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) { + static const Packet16uc perm = { + 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b }; + #ifdef _BIG_ENDIAN + return vec_perm(p4i_ZERO, a, perm); + #else + return vec_perm(a, p4i_ZERO, perm); + #endif +} + +template +struct plogical_shift_left_impl; + +template +struct plogical_shift_left_impl= 0)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned n = static_cast(N); + const Packet4ui shift = {n, n, n, n}; + const Packet4i ai = reinterpret_cast(a); + static const unsigned m = static_cast(32 - N); + const Packet4ui shift_right = {m, m, m, m}; + const Packet4i out_hi = vec_sl(ai, shift); + const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right)); + return reinterpret_cast(por(out_hi, out_lo)); + } +}; + +template +struct plogical_shift_left_impl= 32)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned m = static_cast(N - 32); + const Packet4ui shift = {m, m, m, m}; + const Packet4i ai = reinterpret_cast(a); + return reinterpret_cast(shift_even_left(vec_sl(ai, shift))); + } +}; + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + return plogical_shift_left_impl::run(a); +} + +template +struct plogical_shift_right_impl; + +template +struct plogical_shift_right_impl= 0)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned n = static_cast(N); + const Packet4ui shift = {n, n, n, 
n}; + const Packet4i ai = reinterpret_cast(a); + static const unsigned m = static_cast(32 - N); + const Packet4ui shift_left = {m, m, m, m}; + const Packet4i out_lo = vec_sr(ai, shift); + const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left)); + return reinterpret_cast(por(out_hi, out_lo)); + } +}; + +template +struct plogical_shift_right_impl= 32)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned m = static_cast(N - 32); + const Packet4ui shift = {m, m, m, m}; + const Packet4i ai = reinterpret_cast(a); + return reinterpret_cast(shift_odd_right(vec_sr(ai, shift))); + } +}; + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + return plogical_shift_right_impl::run(a); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2l e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply: + const Packet2l bias = { 1023, 1023 }; + Packet2l b = plogical_shift_right<2>(e); // floor(e/4) + Packet2d c = reinterpret_cast(plogical_shift_left<52>(b + bias)); + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = reinterpret_cast(plogical_shift_left<52>(b + bias)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; +} + + +// Extract exponent without existence of Packet2l. 
+template<> +EIGEN_STRONG_INLINE +Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) { + return pcast(plogical_shift_right<52>(reinterpret_cast(pabs(a)))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp (const Packet2d& a, Packet2d& exponent) { + return pfrexp_generic(a, exponent); +} + template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { Packet2d b, sum; @@ -964,20 +2668,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - Packet2d v[2], sum; - v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); - v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); - -#ifdef _BIG_ENDIAN - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); -#else - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[1]), reinterpret_cast(v[0]), 8)); -#endif - - return sum; -} // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) @@ -997,20 +2687,6 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) -#ifdef _BIG_ENDIAN - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); -#else - first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); -#endif - } -}; - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet2d t0, t1; @@ -1022,9 +2698,11 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2l select = { ifPacket.select[0], 
ifPacket.select[1] }; - Packet2bl mask = vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)); + Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); return vec_sel(elsePacket, thenPacket, mask); } + + #endif // __VSX__ } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/CUDA/Complex.h b/externals/eigen/Eigen/src/Core/arch/CUDA/Complex.h index 9c253650..deb4c869 100644 --- a/externals/eigen/Eigen/src/Core/arch/CUDA/Complex.h +++ b/externals/eigen/Eigen/src/Core/arch/CUDA/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2021 C. Antonio Sanchez // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -11,93 +12,247 @@ #define EIGEN_COMPLEX_CUDA_H // clang-format off +// Many std::complex methods such as operator+, operator-, operator* and +// operator/ are not constexpr. Due to this, GCC and older versions of clang do +// not treat them as device functions and thus Eigen functors making use of +// these operators fail to compile. Here, we manually specialize these +// operators and functors for complex types when building for CUDA to enable +// their use on-device. + +#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) + +// ICC already specializes std::complex and std::complex +// operators, preventing us from making them device functions here. +// This will lead to silent runtime errors if the operators are used on device. +// +// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_ +// prior to first inclusion of . This prevents ICC from adding +// its own specializations, so our custom ones below can be used instead. +#if !(defined(EIGEN_COMP_ICC) && defined(_USE_COMPLEX_SPECIALIZATION_)) + +// Import Eigen's internal operator specializations. 
+#define EIGEN_USING_STD_COMPLEX_OPERATORS \ + using Eigen::complex_operator_detail::operator+; \ + using Eigen::complex_operator_detail::operator-; \ + using Eigen::complex_operator_detail::operator*; \ + using Eigen::complex_operator_detail::operator/; \ + using Eigen::complex_operator_detail::operator+=; \ + using Eigen::complex_operator_detail::operator-=; \ + using Eigen::complex_operator_detail::operator*=; \ + using Eigen::complex_operator_detail::operator/=; \ + using Eigen::complex_operator_detail::operator==; \ + using Eigen::complex_operator_detail::operator!=; namespace Eigen { -namespace internal { +// Specialized std::complex overloads. +namespace complex_operator_detail { -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_multiply(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + return std::complex( + a_real * b_real - a_imag * b_imag, + a_imag * b_real + a_real * b_imag); +} -// Many std::complex methods such as operator+, operator-, operator* and -// operator/ are not constexpr. Due to this, clang does not treat them as device -// functions and thus Eigen functors making use of these operators fail to -// compile. Here, we manually specialize these functors for complex types when -// building for CUDA to avoid non-constexpr methods. 
- -// Sum -template struct scalar_sum_op, const std::complex > : binary_op_base, const std::complex > { - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - return std::complex(numext::real(a) + numext::real(b), - numext::imag(a) + numext::imag(b)); - } -}; - -template struct scalar_sum_op, std::complex > : scalar_sum_op, const std::complex > {}; - - -// Difference -template struct scalar_difference_op, const std::complex > : binary_op_base, const std::complex > { - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - return std::complex(numext::real(a) - numext::real(b), - numext::imag(a) - numext::imag(b)); - } -}; - -template struct scalar_difference_op, std::complex > : scalar_difference_op, const std::complex > {}; - - -// Product -template struct scalar_product_op, const std::complex > : binary_op_base, const std::complex > { - enum { - Vectorizable = packet_traits>::HasMul - }; - typedef typename std::complex result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - const T a_real = numext::real(a); - const T a_imag = numext::imag(a); - const T b_real = numext::real(b); - const T b_imag = numext::imag(b); - return std::complex(a_real * b_real - a_imag * b_imag, - a_real * b_imag + a_imag * b_real); - } -}; - -template struct scalar_product_op, std::complex > : scalar_product_op, const std::complex > {}; - - -// Quotient -template struct scalar_quotient_op, const std::complex > : binary_op_base, const std::complex > { - enum { - Vectorizable = packet_traits>::HasDiv - }; - typedef typename std::complex result_type; - - 
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex operator() (const std::complex& a, const std::complex& b) const { - const T a_real = numext::real(a); - const T a_imag = numext::imag(a); - const T b_real = numext::real(b); - const T b_imag = numext::imag(b); - const T norm = T(1) / (b_real * b_real + b_imag * b_imag); - return std::complex((a_real * b_real + a_imag * b_imag) * norm, - (a_imag * b_real - a_real * b_imag) * norm); - } -}; - -template struct scalar_quotient_op, std::complex > : scalar_quotient_op, const std::complex > {}; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_divide_fast(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + const T norm = (b_real * b_real + b_imag * b_imag); + return std::complex((a_real * b_real + a_imag * b_imag) / norm, + (a_imag * b_real - a_real * b_imag) / norm); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_divide_stable(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf), + // guards against over/under-flow. + const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real); + const T rscale = scale_imag ? T(1) : b_real / b_imag; + const T iscale = scale_imag ? 
b_imag / b_real : T(1); + const T denominator = b_real * rscale + b_imag * iscale; + return std::complex((a_real * rscale + a_imag * iscale) / denominator, + (a_imag * rscale - a_real * iscale) / denominator); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex complex_divide(const std::complex& a, const std::complex& b) { +#if EIGEN_FAST_MATH + return complex_divide_fast(a, b); +#else + return complex_divide_stable(a, b); #endif +} + +// NOTE: We cannot specialize compound assignment operators with Scalar T, +// (i.e. operator@=(const T&), for @=+,-,*,/) +// since they are already specialized for float/double/long double within +// the standard header. We also do not specialize the stream +// operators. +#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T) \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const std::complex& a) { return a; } \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const std::complex& a) { \ + return std::complex(-numext::real(a), -numext::imag(a)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const std::complex& a, const std::complex& b) { \ + return std::complex(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) + b, numext::imag(a)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator+(const T& a, const std::complex& b) { \ + return std::complex(a + numext::real(b), numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const std::complex& a, const std::complex& b) { \ + return std::complex(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) 
- b, numext::imag(a)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator-(const T& a, const std::complex& b) { \ + return std::complex(a - numext::real(b), -numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator*(const std::complex& a, const std::complex& b) { \ + return complex_multiply(a, b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator*(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) * b, numext::imag(a) * b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator*(const T& a, const std::complex& b) { \ + return std::complex(a * numext::real(b), a * numext::imag(b)); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator/(const std::complex& a, const std::complex& b) { \ + return complex_divide(a, b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator/(const std::complex& a, const T& b) { \ + return std::complex(numext::real(a) / b, numext::imag(a) / b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex operator/(const T& a, const std::complex& b) { \ + return complex_divide(std::complex(a, 0), b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator+=(std::complex& a, const std::complex& b) { \ + numext::real_ref(a) += numext::real(b); \ + numext::imag_ref(a) += numext::imag(b); \ + return a; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator-=(std::complex& a, const std::complex& b) { \ + numext::real_ref(a) -= numext::real(b); \ + numext::imag_ref(a) -= numext::imag(b); \ + return a; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator*=(std::complex& a, const std::complex& b) { \ + a = complex_multiply(a, b); \ + return a; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +std::complex& operator/=(std::complex& a, const std::complex& b) { \ + a = complex_divide(a, b); \ + return a; \ +} \ 
+ \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator==(const std::complex& a, const std::complex& b) { \ + return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator==(const std::complex& a, const T& b) { \ + return numext::real(a) == b && numext::imag(a) == 0; \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator==(const T& a, const std::complex& b) { \ + return a == numext::real(b) && 0 == numext::imag(b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator!=(const std::complex& a, const std::complex& b) { \ + return !(a == b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator!=(const std::complex& a, const T& b) { \ + return !(a == b); \ +} \ + \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +bool operator!=(const T& a, const std::complex& b) { \ + return !(a == b); \ +} + +// Do not specialize for long double, since that reduces to double on device. 
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float) +EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double) + +#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS + + +} // namespace complex_operator_detail + +EIGEN_USING_STD_COMPLEX_OPERATORS + +namespace numext { +EIGEN_USING_STD_COMPLEX_OPERATORS +} // namespace numext + +namespace internal { +EIGEN_USING_STD_COMPLEX_OPERATORS + +} // namespace internal +} // namespace Eigen -} // end namespace internal +#endif // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_) -} // end namespace Eigen +#endif // EIGEN_CUDACC && EIGEN_GPU_COMPILE_PHASE -#endif // EIGEN_COMPLEX_CUDA_H +#endif // EIGEN_COMPLEX_CUDA_H diff --git a/externals/eigen/Eigen/src/Core/arch/CUDA/Half.h b/externals/eigen/Eigen/src/Core/arch/CUDA/Half.h deleted file mode 100644 index 52892db3..00000000 --- a/externals/eigen/Eigen/src/Core/arch/CUDA/Half.h +++ /dev/null @@ -1,585 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// -// The conversion routines are Copyright (c) Fabian Giesen, 2016. -// The original license follows: -// -// Copyright (c) Fabian Giesen, 2016 -// All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted. -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with -// operator overloads such that it behaves basically as an arithmetic -// type. It will be quite slow on CPUs (so it is recommended to stay -// in fp32 for CPUs, except for simple parameter conversions, I/O -// to disk and the likes), but fast on GPUs. - - -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H - -#if __cplusplus > 199711L -#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() -#else -#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() -#endif - - -namespace Eigen { - -struct half; - -namespace half_impl { - -#if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} - unsigned short x; -}; - -#endif - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); - -struct half_base : public __half { - EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} -}; - -} // namespace half_impl - -// Class definition. 
-struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; - #endif - - EIGEN_DEVICE_FUNC half() {} - - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} - EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} - - explicit EIGEN_DEVICE_FUNC half(bool b) - : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} - template - explicit EIGEN_DEVICE_FUNC half(const T& val) - : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {} - explicit EIGEN_DEVICE_FUNC half(float f) - : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} - - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { - // +0.0 and -0.0 become false, everything else becomes true. - return (x & 0x7fff) != 0; - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { - return static_cast(half_to_float(*this)); - } - 
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { - return half_impl::half_to_float(*this); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { - return static_cast(half_impl::half_to_float(*this)); - } - - EIGEN_DEVICE_FUNC half& operator=(const half& other) { - x = other.x; - return *this; - } -}; - -namespace half_impl { - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - -// Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than fp32 arithmetic (you need to use the half2 -// versions to get the ALU speed increased), but you do save the -// conversion steps back and forth. - -__device__ half operator + (const half& a, const half& b) { - return __hadd(a, b); -} -__device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); -} -__device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); -} -__device__ half operator / (const half& a, const half& b) { - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -} -__device__ half operator - (const half& a) { - return __hneg(a); -} -__device__ half& operator += (half& a, const half& b) { - a = a + b; - return a; -} -__device__ half& operator *= (half& a, const half& b) { - a = a * b; - return a; -} -__device__ half& operator -= (half& a, const half& b) { - a = a - b; - return a; -} -__device__ half& operator /= (half& a, const half& b) { - a = a / b; - return a; -} -__device__ bool operator == (const half& a, const half& b) { - return __heq(a, b); -} -__device__ bool operator != (const half& a, const half& b) { - return __hne(a, b); -} -__device__ bool operator < (const half& a, const half& b) { - return __hlt(a, b); -} -__device__ bool operator <= (const half& a, const half& b) { - return __hle(a, b); -} -__device__ bool operator > (const half& a, const half& b) { - return __hgt(a, b); -} -__device__ bool operator >= (const half& a, const half& b) 
{ - return __hge(a, b); -} - -#else // Emulate support for half floats - -// Definitions for CPUs and older CUDA, mostly working through conversion -// to/from fp32. - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { - return half(float(a) + float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { - return half(float(a) * float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { - return half(float(a) - float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { - return half(float(a) / float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { - half result; - result.x = a.x ^ 0x8000; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { - a = half(float(a) + float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { - a = half(float(a) * float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { - a = half(float(a) - float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { - a = half(float(a) / float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return float(a) == float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return float(a) != float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { - return float(a) < float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { - return float(a) <= float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { - return float(a) > float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool 
operator >= (const half& a, const half& b) { - return float(a) >= float(b); -} - -#endif // Emulate support for half floats - -// Division by an index. Do it in full float precision to avoid accuracy -// issues in converting the denominator to half. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { - return half(static_cast(a) / static_cast(b)); -} - -// Conversion routines, including fallbacks for the host or older CUDA. -// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of -// these in hardware. If we need more performance on older/other CPUs, they are -// also possible to vectorize directly. - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; - h.x = x; - return h; -} - -union FP32 { - unsigned int u; - float f; -}; - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(ff); - -#elif defined(EIGEN_HAS_FP16_C) - __half h; - h.x = _cvtss_sh(ff, 0); - return h; - -#else - FP32 f; f.f = ff; - - const FP32 f32infty = { 255 << 23 }; - const FP32 f16max = { (127 + 16) << 23 }; - const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; - unsigned int sign_mask = 0x80000000u; - __half o; - o.x = static_cast(0x0u); - - unsigned int sign = f.u & sign_mask; - f.u ^= sign; - - // NOTE all the integer compares in this function can be safely - // compiled into signed compares since all operands are below - // 0x80000000. Important if you want fast straight SSE2 code - // (since there's no unsigned PCMPGTD). - - if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) - o.x = (f.u > f32infty.u) ? 
0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf - } else { // (De)normalized number or zero - if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero - // use a magic value to align our 10 mantissa bits at the bottom of - // the float. as long as FP addition is round-to-nearest-even this - // just works. - f.f += denorm_magic.f; - - // and one integer subtract of the bias later, we have our final float! - o.x = static_cast(f.u - denorm_magic.u); - } else { - unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - - // update exponent, rounding bias part 1 - f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; - // rounding bias part 2 - f.u += mant_odd; - // take the bits! - o.x = static_cast(f.u >> 13); - } - } - - o.x |= static_cast(sign >> 16); - return o; -#endif -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __half2float(h); - -#elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); - -#else - const FP32 magic = { 113 << 23 }; - const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - FP32 o; - - o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits - unsigned int exp = shifted_exp & o.u; // just the exponent - o.u += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) { // Inf/NaN? - o.u += (128 - 16) << 23; // extra exp adjust - } else if (exp == 0) { // Zero/Denormal? 
- o.u += 1 << 23; // extra exp adjust - o.f -= magic.f; // renormalize - } - - o.u |= (h.x & 0x8000) << 16; // sign bit - return o.f; -#endif -} - -// --- standard functions --- - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { - return (a.x & 0x7fff) == 0x7c00; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(a); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { - return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { - half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { - return half(::expf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return Eigen::half(::hlog(a)); -#else - return half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { - return half(numext::log1p(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { - return half(::log10f(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { - return half(::sqrtf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { - return half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { - return half(::sinf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { - return half(::cosf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { - return half(::tanf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { - 
return half(::tanhf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { - return half(::floorf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { - return half(::ceilf(float(a))); -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(b, a) ? b : a; -#else - const float f1 = static_cast(a); - const float f2 = static_cast(b); - return f2 < f1 ? b : a; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a, b) ? b : a; -#else - const float f1 = static_cast(a); - const float f2 = static_cast(b); - return f1 < f2 ? b : a; -#endif -} - -EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { - os << static_cast(v); - return os; -} - -} // end namespace half_impl - -// import Eigen::half_impl::half into Eigen namespace -// using half_impl::half; - -namespace internal { - -template<> -struct random_default_impl -{ - static inline half run(const half& x, const half& y) - { - return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); - } - static inline half run() - { - return run(half(-1.f), half(1.f)); - } -}; - -template<> struct is_arithmetic { enum { value = true }; }; - -} // end namespace internal - -template<> struct NumTraits - : GenericNumTraits -{ - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { - return half_impl::raw_uint16_to_half(0x0800); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { - return half_impl::raw_uint16_to_half(0x7bff); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { - return half_impl::raw_uint16_to_half(0xfbff); 
- } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { - return half_impl::raw_uint16_to_half(0x7c00); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { - return half_impl::raw_uint16_to_half(0x7c01); - } -}; - -} // end namespace Eigen - -// C-like standard mathematical functions and trancendentals. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { - Eigen::half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { - return Eigen::half(::expf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return Eigen::half(::hlog(a)); -#else - return Eigen::half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { - return Eigen::half(::sqrtf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { - return Eigen::half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { - return Eigen::half(::floorf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { - return Eigen::half(::ceilf(float(a))); -} - -namespace std { - -#if __cplusplus > 199711L -template <> -struct hash { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { - return static_cast(a.x); - } -}; -#endif - -} // end namespace std - - -// Add the missing shfl_xor intrinsic -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 -__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - return static_cast(__shfl_xor(static_cast(var), laneMask, width)); -} -#endif - -// ldg() has an overload for __half, but we 
also need one for Eigen::half. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { - return Eigen::half_impl::raw_uint16_to_half( - __ldg(reinterpret_cast(ptr))); -} -#endif - - -#if defined(__CUDA_ARCH__) -namespace Eigen { -namespace numext { - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isnan)(const Eigen::half& h) { - return (half_impl::isnan)(h); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isinf)(const Eigen::half& h) { - return (half_impl::isinf)(h); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -bool (isfinite)(const Eigen::half& h) { - return (half_impl::isfinite)(h); -} - -} // namespace Eigen -} // namespace numext -#endif - -#endif // EIGEN_HALF_CUDA_H diff --git a/externals/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h deleted file mode 100644 index ad66399e..00000000 --- a/externals/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h +++ /dev/null @@ -1,333 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_CUDA_H -#define EIGEN_PACKET_MATH_CUDA_H - -namespace Eigen { - -namespace internal { - -// Make sure this is only available when targeting a GPU: we don't want to -// introduce conflicts between these packet_traits definitions and the ones -// we'll use on the host side (SSE, AVX, ...) 
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef float4 type; - typedef float4 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 0, - - HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasIGamma = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - }; -}; - -template<> struct packet_traits : default_packet_traits -{ - typedef double2 type; - typedef double2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasIGamma = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - }; -}; - - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { - return make_float4(from, from, from, from); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { - return make_double2(from, from); -} - - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { - return make_float4(a, a+1, a+2, a+3); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { - return make_double2(a, a+1); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -} 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { - return make_double2(a.x+b.x, a.y+b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { - return make_double2(a.x-b.x, a.y-b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { - return make_float4(-a.x, -a.y, -a.z, -a.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { - return make_double2(-a.x, -a.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { - return make_double2(a.x*b.x, a.y*b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { - return make_double2(a.x/b.x, a.y/b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { - return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { - return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { - return make_float4(fmaxf(a.x, 
b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { - return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { - return make_float4(from[0], from[1], from[2], from[3]); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { - return make_double2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { - return make_float4(from[0], from[0], from[1], from[1]); -} -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { - return make_double2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { - to[0] = from.x; - to[1] = from.y; - to[2] = from.z; - to[3] = from.w; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { - to[0] = from.x; - to[1] = from.y; -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const float4*)from); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return __ldg((const 
double2*)from); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - return make_double2(__ldg(from+0), __ldg(from+1)); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { - return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { - return make_double2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; - to[stride*2] = from.z; - to[stride*3] = from.w; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; -} - -template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { - return a.x; -} -template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { - return a.x; -} - -template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { - return a.x + a.y + a.z + a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { - return a.x + a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { - return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { - return fmax(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { - 
return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { - return fmin(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { - return a.x * a.y * a.z * a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { - return a.x * a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { - return make_double2(fabs(a.x), fabs(a.y)); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; - - tmp = kernel.packet[0].z; - kernel.packet[0].z = kernel.packet[2].x; - kernel.packet[2].x = tmp; - - tmp = kernel.packet[0].w; - kernel.packet[0].w = kernel.packet[3].x; - kernel.packet[3].x = tmp; - - tmp = kernel.packet[1].z; - kernel.packet[1].z = kernel.packet[2].y; - kernel.packet[2].y = tmp; - - tmp = kernel.packet[1].w; - kernel.packet[1].w = kernel.packet[3].y; - kernel.packet[3].y = tmp; - - tmp = kernel.packet[2].w; - kernel.packet[2].w = kernel.packet[3].z; - kernel.packet[3].z = tmp; -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - - -#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/externals/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/externals/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h deleted file mode 100644 index ae54225f..00000000 --- a/externals/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ /dev/null @@ -1,1123 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H - - -namespace Eigen { -namespace internal { - -// Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef half2 type; - typedef half2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - HasAdd = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasLog = 1, - HasLog1p = 1 - }; -}; - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; - -template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { - return __half2half2(from); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { - *reinterpret_cast(to) = from; -} - -template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = __low2half(from); - to[1] = __high2half(from); -} - -template<> - __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 - return __ldg((const half2*)from); -#else - return 
__halves2half2(*(from+0), *(from+1)); -#endif -} - -template<> -__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 - return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half2 result; - result.x = a.x & 0x7FFF7FFF; - return result; -} - - -__device__ EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if __CUDA_ARCH__ >= 530 - return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hsub2(a, b); -#else - 
float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } - -template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if __CUDA_ARCH__ >= 530 - return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? 
__high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if __CUDA_ARCH__ >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if __CUDA_ARCH__ >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? 
__low2half(a) : __high2half(a); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if __CUDA_ARCH__ >= 530 - return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); -#endif -} - -template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = log1pf(a1); - float r2 = log1pf(a2); - return __floats2half2_rn(r1, r2); -} - -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 - -template<> __device__ EIGEN_STRONG_INLINE -half2 plog(const half2& a) { - return h2log(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 pexp(const half2& a) { - return h2exp(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 psqrt(const half2& a) { - return h2sqrt(a); -} - -template<> __device__ EIGEN_STRONG_INLINE -half2 prsqrt(const half2& a) { - return h2rsqrt(a); -} - -#else - -template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -#endif - -#elif defined 
EIGEN_VECTORIZE_AVX512 - -typedef struct { - __m256i x; -} Packet16h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16h type; - // There is no half-size packet for Packet16h. - typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; - -template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { - Packet16h result; - result.x = _mm256_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_load_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { - Packet16h result; - result.x = _mm256_loadu_si256(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - _mm256_store_si256((__m256i*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - _mm256_storeu_si256((__m256i*)to, from.x); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploadquad(const Eigen::half* from) { - Packet16h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, 
b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtph_ps(a.x); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet16h result; - result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - Packet16h result; - result.x = _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - -template<> 
EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) -{ - Packet16h result; - result.x = _mm256_set_epi16( - from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, - from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, - from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, - from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) -{ - EIGEN_ALIGN64 half aux[16]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; - to[stride*8].x = aux[8].x; - to[stride*9].x = aux[9].x; - to[stride*10].x = aux[10].x; - to[stride*11].x = aux[11].x; - to[stride*12].x = aux[12].x; - to[stride*13].x = aux[13].x; - to[stride*14].x = aux[14].x; - to[stride*15].x = aux[15].x; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m256i a = kernel.packet[0].x; - __m256i b = kernel.packet[1].x; - __m256i c = kernel.packet[2].x; - __m256i d = kernel.packet[3].x; - __m256i e = kernel.packet[4].x; - __m256i f = kernel.packet[5].x; - __m256i g = kernel.packet[6].x; - __m256i h = kernel.packet[7].x; - __m256i i = kernel.packet[8].x; - __m256i j = kernel.packet[9].x; - __m256i k = kernel.packet[10].x; - __m256i l = kernel.packet[11].x; - __m256i m = kernel.packet[12].x; - __m256i n = kernel.packet[13].x; - __m256i o = kernel.packet[14].x; - __m256i p = kernel.packet[15].x; - - __m256i ab_07 = _mm256_unpacklo_epi16(a, b); - __m256i cd_07 = _mm256_unpacklo_epi16(c, d); - __m256i ef_07 = _mm256_unpacklo_epi16(e, f); - __m256i gh_07 = _mm256_unpacklo_epi16(g, h); - __m256i ij_07 = _mm256_unpacklo_epi16(i, j); - __m256i kl_07 = _mm256_unpacklo_epi16(k, l); - __m256i mn_07 = 
_mm256_unpacklo_epi16(m, n); - __m256i op_07 = _mm256_unpacklo_epi16(o, p); - - __m256i ab_8f = _mm256_unpackhi_epi16(a, b); - __m256i cd_8f = _mm256_unpackhi_epi16(c, d); - __m256i ef_8f = _mm256_unpackhi_epi16(e, f); - __m256i gh_8f = _mm256_unpackhi_epi16(g, h); - __m256i ij_8f = _mm256_unpackhi_epi16(i, j); - __m256i kl_8f = _mm256_unpackhi_epi16(k, l); - __m256i mn_8f = _mm256_unpackhi_epi16(m, n); - __m256i op_8f = _mm256_unpackhi_epi16(o, p); - - __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); - __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); - __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); - __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); - __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); - __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); - __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); - __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); - - __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); - __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); - __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); - __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); - __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); - __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); - __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); - __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); - - __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); - __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); - __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); - __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); - __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); - __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); - __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); - __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); - __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); - __m256i abcdefgh_ab = 
_mm256_unpackhi_epi64(abcd_8b, efgh_8b); - __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); - __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); - __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); - __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); - __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); - __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); - - // NOTE: no unpacklo/hi instr in this case, so using permute instr. - __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); - __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); - - kernel.packet[0].x = a_p_0; - kernel.packet[1].x = a_p_1; - kernel.packet[2].x = a_p_2; - kernel.packet[3].x = a_p_3; - kernel.packet[4].x = a_p_4; - kernel.packet[5].x = a_p_5; - kernel.packet[6].x = a_p_6; - kernel.packet[7].x = a_p_7; - kernel.packet[8].x = 
a_p_8; - kernel.packet[9].x = a_p_9; - kernel.packet[10].x = a_p_a; - kernel.packet[11].x = a_p_b; - kernel.packet[12].x = a_p_c; - kernel.packet[13].x = a_p_d; - kernel.packet[14].x = a_p_e; - kernel.packet[15].x = a_p_f; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[8][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - pstore(in[4], kernel.packet[4]); - pstore(in[5], kernel.packet[5]); - pstore(in[6], kernel.packet[6]); - pstore(in[7], kernel.packet[7]); - - EIGEN_ALIGN64 half out[8][16]; - - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 8; ++j) { - out[i][j+8] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); - kernel.packet[4] = pload(out[4]); - kernel.packet[5] = pload(out[5]); - kernel.packet[6] = pload(out[6]); - kernel.packet[7] = pload(out[7]); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[4][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN64 half out[4][16]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][4*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][4*i+1]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+8] = in[j][4*i+2]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+12] = in[j][4*i+3]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -#elif defined EIGEN_VECTORIZE_AVX - -typedef struct { - __m128i x; -} Packet8h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits 
: default_packet_traits { - typedef Packet8h type; - // There is no half-size packet for Packet8h. - typedef Packet8h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; - -template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { - Packet8h result; - result.x = _mm_set1_epi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { - Packet8h result; - result.x = _mm_load_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { - Packet8h result; - result.x = _mm_loadu_si128(reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { - _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploadquad(const Eigen::half* from) { - Packet8h result; - unsigned short a = from[0].x; - unsigned short b = from[1].x; - result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); - return result; -} - -EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtph_ps(a.x); -#else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - 
float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { -#ifdef EIGEN_HAS_FP16_C - Packet8h result; - result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); - return result; -#else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - Eigen::half h4(aux[4]); - Eigen::half h5(aux[5]); - Eigen::half h6(aux[6]); - Eigen::half h7(aux[7]); - - Packet8h result; - result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); - return result; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) -{ - Packet8h result; - result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) -{ - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; -} - -template<> 
EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m128i a = kernel.packet[0].x; - __m128i b = kernel.packet[1].x; - __m128i c = kernel.packet[2].x; - __m128i d = kernel.packet[3].x; - __m128i e = kernel.packet[4].x; - __m128i f = kernel.packet[5].x; - __m128i g = kernel.packet[6].x; - __m128i h = kernel.packet[7].x; - - __m128i a03b03 = _mm_unpacklo_epi16(a, b); - __m128i c03d03 = _mm_unpacklo_epi16(c, d); - __m128i e03f03 = _mm_unpacklo_epi16(e, f); - __m128i g03h03 = _mm_unpacklo_epi16(g, h); - __m128i a47b47 = _mm_unpackhi_epi16(a, b); - __m128i c47d47 = _mm_unpackhi_epi16(c, d); - __m128i e47f47 = _mm_unpackhi_epi16(e, f); - __m128i g47h47 = _mm_unpackhi_epi16(g, h); - - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); - - __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); - __m128i a1b1c1d1e1f1g1h1 = 
_mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); - __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); - __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); - __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); - __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); - __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); - __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); - - kernel.packet[0].x = a0b0c0d0e0f0g0h0; - kernel.packet[1].x = a1b1c1d1e1f1g1h1; - kernel.packet[2].x = a2b2c2d2e2f2g2h2; - kernel.packet[3].x = a3b3c3d3e3f3g3h3; - kernel.packet[4].x = a4b4c4d4e4f4g4h4; - kernel.packet[5].x = a5b5c5d5e5f5g5h5; - kernel.packet[6].x = a6b6c6d6e6f6g6h6; - kernel.packet[7].x = a7b7c7d7e7f7g7h7; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 Eigen::half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN32 Eigen::half out[4][8]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} - - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -typedef struct { - __m64 x; -} Packet4h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4h type; - // There is no half-size packet for Packet4h. 
- typedef Packet4h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasDiv = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; - -template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { - Packet4h result; - result.x = _mm_set1_pi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); -} - -template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha + hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = 
half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha * hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE Packet4h -ploadquad(const Eigen::half* from) { - return pset1(*from); -} - -template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) -{ - Packet4h result; - result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) -{ - __int64_t a = _mm_cvtm64_si64(from.x); - to[stride*0].x = static_cast(a); - to[stride*1].x = static_cast(a >> 16); - to[stride*2].x = static_cast(a >> 32); - to[stride*3].x = static_cast(a >> 48); -} - 
-EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); - __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); - - kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); - kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); - kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); - kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); -} - -#endif - -} -} - -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/externals/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h deleted file mode 100644 index aa5fbce8..00000000 --- a/externals/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ /dev/null @@ -1,212 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H - -namespace Eigen { - -namespace internal { - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(a); - #else - return Eigen::half(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(static_cast(a)); - #else - return Eigen::half(static_cast(a)); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __half2float(a); - #else - return static_cast(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 2, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { - float2 r1 = __half22float2(a); - float2 r2 = __half22float2(b); - return make_float4(r1.x, r1.y, r2.x, r2.y); -} - -template <> -struct 
type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { - // Simply discard the second half of the input - return __floats2half2_rn(a.x, a.y); -} - -#elif defined EIGEN_VECTORIZE_AVX512 -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { - return float2half(a); -} - -#elif defined EIGEN_VECTORIZE_AVX - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { - return float2half(a); -} - -// Disable the following code since it's broken on too many platforms / compilers. 
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#elif 0 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - Eigen::half h = raw_uint16_to_half(static_cast(a64)); - float f1 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 16)); - float f2 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 32)); - float f3 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 48)); - float f4 = static_cast(h); - return _mm_set_ps(f4, f3, f2, f1); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { - EIGEN_ALIGN16 float aux[4]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - - Packet4h result; - result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); - return result; -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/externals/eigen/Eigen/src/Core/arch/Default/BFloat16.h b/externals/eigen/Eigen/src/Core/arch/Default/BFloat16.h new file mode 100644 index 00000000..1c28f4f9 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/Default/BFloat16.h @@ -0,0 +1,700 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef EIGEN_BFLOAT16_H +#define EIGEN_BFLOAT16_H + +#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \ + template <> \ + EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED \ + PACKET_BF16 METHOD(const PACKET_BF16& _x) { \ + return F32ToBf16(METHOD(Bf16ToF32(_x))); \ + } + +namespace Eigen { + +struct bfloat16; + +namespace bfloat16_impl { + +// Make our own __bfloat16_raw definition. +struct __bfloat16_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {} + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {} + unsigned short value; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value); +template +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying: +// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h); + +struct bfloat16_base : public __bfloat16_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {} +}; + +} // namespace bfloat16_impl + +// Class definition. 
+struct bfloat16 : public bfloat16_impl::bfloat16_base { + + typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {} + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {} + + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b) + : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {} + + template + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne::value>(static_cast(val))) {} + + explicit EIGEN_DEVICE_FUNC bfloat16(float f) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne(f)) {} + + // Following the convention of numpy, converting between complex and + // float will lead to loss of imag value. + template + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex& val) + : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne(static_cast(val.real()))) {} + + EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. 
+ return bfloat16_impl::bfloat16_to_float(*this); + } +}; +} // namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = std::denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = numeric_limits::round_style; + static const bool is_iec559 = false; + static const bool is_bounded = true; + static const bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 4; + static const int radix = 2; + static const int min_exponent = numeric_limits::min_exponent; + static const int min_exponent10 = numeric_limits::min_exponent10; + static const int max_exponent = numeric_limits::max_exponent; + static const int max_exponent10 = numeric_limits::max_exponent10; + static const bool traps = numeric_limits::traps; + static const bool tinyness_before = numeric_limits::tinyness_before; + + static Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); } + static Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); } + static Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); } + static Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); } + static Eigen::bfloat16 round_error() { return Eigen::bfloat16(0x3f00); } + static Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); } + static Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); } + static Eigen::bfloat16 signaling_NaN() { return 
Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f81); } + static Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // namespace std + +namespace Eigen { + +namespace bfloat16_impl { + +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +// We need to provide emulated *host-side* BF16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_NATIVE_BF16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif + +// Definitions for CPUs, mostly working through conversion +// to/from fp32. 
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) { + return bfloat16(float(a) + static_cast(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) { + return bfloat16(static_cast(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) { + return bfloat16(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) { + bfloat16 result; + result.value = a.value ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) { + a = bfloat16(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) { + a += bfloat16(1); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) { + a -= bfloat16(1); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) { + bfloat16 original_value = a; + ++a; + return original_value; +} +EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) { + bfloat16 original_value = a; + --a; + return original_value; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) { + return float(a) >= float(b); +} + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif +#endif // Emulate support for bfloat16 floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to bfloat16. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) { + __bfloat16_raw output; + if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { + output.value = std::signbit(v) ? 
0xFFC0: 0x7FC0; + return output; + } + const uint16_t* p = reinterpret_cast(&v); +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + output.value = p[0]; +#else + output.value = p[1]; +#endif + return output; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) { + return __bfloat16_raw(value); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) { + return bf.value; +} + +// float_to_bfloat16_rtne template specialization that does not make any +// assumption about the value of its function argument (ff). +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) + // Nothing to do here +#else + __bfloat16_raw output; + + if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(ff)) { + // If the value is a NaN, squash it to a qNaN with msb of fraction set, + // this makes sure after truncation we don't end up with an inf. + // + // qNaN magic: All exponent bits set + most significant bit of fraction + // set. + output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0; + } else { + // Fast rounding algorithm that rounds a half value to nearest even. This + // reduces expected error when we convert a large number of floats. Here + // is how it works: + // + // Definitions: + // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits + // with the following tags: + // + // Sign | Exp (8 bits) | Frac (23 bits) + // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT + // + // S: Sign bit. + // E: Exponent bits. + // F: First 6 bits of fraction. + // L: Least significant bit of resulting bfloat16 if we truncate away the + // rest of the float32. This is also the 7th bit of fraction + // R: Rounding bit, 8th bit of fraction. + // T: Sticky bits, rest of fraction, 15 bits. 
+ // + // To round half to nearest even, there are 3 cases where we want to round + // down (simply truncate the result of the bits away, which consists of + // rounding bit and sticky bits) and two cases where we want to round up + // (truncate then add one to the result). + // + // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of + // 1s) as the rounding bias, adds the rounding bias to the input, then + // truncates the last 16 bits away. + // + // To understand how it works, we can analyze this algorithm case by case: + // + // 1. L = 0, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input may create any carry, depending on + // whether there is any value set to 1 in T bits. + // - R may be set to 1 if there is a carry. + // - L remains 0. + // - Note that this case also handles Inf and -Inf, where all fraction + // bits, including L, R and Ts are all 0. The output remains Inf after + // this algorithm. + // + // 2. L = 1, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits but + // adds 1 to rounding bit. + // - L remains 1. + // + // 3. L = 0, R = 1, all of T are 0: + // Expect: round down, this is exactly at half, the result is already + // even (L=0). + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input sets all sticky bits to 1, but + // doesn't create a carry. + // - R remains 1. + // - L remains 0. + // + // 4. L = 1, R = 1: + // Expect: round up, this is exactly at half, the result needs to be + // round to the next even number. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits, but + // creates a carry from rounding bit. 
+ // - The carry sets L to 0, creates another carry bit and propagate + // forward to F bits. + // - If all the F bits are 1, a carry then propagates to the exponent + // bits, which then creates the minimum value with the next exponent + // value. Note that we won't have the case where exponents are all 1, + // since that's either a NaN (handled in the other if condition) or inf + // (handled in case 1). + // + // 5. L = 0, R = 1, any of T is 1: + // Expect: round up, this is greater than half. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input creates a carry from sticky bits, + // sets rounding bit to 0, then create another carry. + // - The second carry sets L to 1. + // + // Examples: + // + // Exact half value that is already even: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 + // + // This falls into case 3. We truncate the rest of 16 bits and no + // carry is created into F and L: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // Exact half value, round to next even number: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 + // + // This falls into case 4. We create a carry from R and T, + // which then propagates into L and F: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // + // Max denormal value round to min normal value: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. 
We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 + // + // Max normal value round to Inf: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 + + // At this point, ff must be either a normal float, or +/-infinity. + output = float_to_bfloat16_rtne(ff); + } + return output; +#endif +} + +// float_to_bfloat16_rtne template specialization that assumes that its function +// argument (ff) is either a normal floating point number, or +/-infinity, or +// zero. Used to improve the runtime performance of conversion from an integer +// type to bfloat16. +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_BF16) && defined(EIGEN_HAS_HIP_BF16)) + // Nothing to do here +#else + numext::uint32_t input = numext::bit_cast(ff); + __bfloat16_raw output; + + // Least significant bit of resulting bfloat. 
+ numext::uint32_t lsb = (input >> 16) & 1; + numext::uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + output.value = static_cast(input >> 16); + return output; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) { + float result = 0; + unsigned short* q = reinterpret_cast(&result); +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = h.value; +#else + q[1] = h.value; +#endif + return result; +} +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) { + EIGEN_USING_STD(isinf); + return (isinf)(float(a)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) { + EIGEN_USING_STD(isnan); + return (isnan)(float(a)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) { + bfloat16 result; + result.value = a.value & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { + return bfloat16(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { + return bfloat16(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) { + return bfloat16(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) { + return bfloat16(static_cast(EIGEN_LOG2E) * ::logf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, 
const bfloat16& b) { + return bfloat16(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { + return bfloat16(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) { + return bfloat16(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) { + return bfloat16(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) { + return bfloat16(::asinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) { + return bfloat16(::acosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) { + return bfloat16(::atanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) { + return bfloat16(::sinhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { + return bfloat16(::coshf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(float(a))); +} +#if EIGEN_HAS_CXX11_MATH +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { + return bfloat16(::asinhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { + return bfloat16(::acoshf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { + return bfloat16(::atanhf(float(a))); +} +#endif +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { + return bfloat16(::rintf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 
fmod(const bfloat16& a, const bfloat16& b) { + return bfloat16(::fmodf(float(a), float(b))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? b : a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return bfloat16(::fminf(f1, f2)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) { + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return bfloat16(::fmaxf(f1, f2)); +} + +#ifndef EIGEN_NO_IO +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) { + os << static_cast(v); + return os; +} +#endif + +} // namespace bfloat16_impl + +namespace internal { + +template<> +struct random_default_impl +{ + static inline bfloat16 run(const bfloat16& x, const bfloat16& y) + { + return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX)); + } + static inline bfloat16 run() + { + return run(bfloat16(-1.f), bfloat16(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D); // bfloat16(5e-2f); + + } + EIGEN_DEVICE_FUNC 
EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() { + return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() { + return bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); + } +}; + +} // namespace Eigen + +namespace Eigen { +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isnan)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isnan)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isinf)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isinf)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isfinite)(const Eigen::bfloat16& h) { + return (bfloat16_impl::isfinite)(h); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast(const uint16_t& src) { + return Eigen::bfloat16(Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src)); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::bfloat16& src) { + return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src); +} + +} // namespace numext +} // namespace Eigen + +#if EIGEN_HAS_STD_HASH +namespace std { +template <> +struct hash { + EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const { + return static_cast(Eigen::numext::bit_cast(a)); + } +}; +} // namespace std +#endif + + +#endif // EIGEN_BFLOAT16_H diff --git a/externals/eigen/Eigen/src/Core/arch/Default/ConjHelper.h b/externals/eigen/Eigen/src/Core/arch/Default/ConjHelper.h new file mode 100644 index 00000000..53830b5a --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/Default/ConjHelper.h @@ -0,0 +1,117 @@ + 
+// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_CONJ_HELPER_H +#define EIGEN_ARCH_CONJ_HELPER_H + +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \ + const PACKET_CPLX& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \ + const PACKET_CPLX& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); \ + } \ + }; \ + \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \ + const PACKET_REAL& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \ + const PACKET_REAL& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); \ + } \ + }; + +namespace Eigen { +namespace internal { + +template struct conj_if; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); } +}; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; } +}; + +// Generic Implementation, assume scalars since the packet-version is +// specialized below. 
+template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsType& x, const RhsType& y) const + { return conj_if()(x) * conj_if()(y); } +}; + +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsScalar& x, const RhsScalar& y) const + { return numext::conj(x * y); } +}; + +// Implementation with equal type, use packet operations. +template +struct conj_helper +{ + typedef Packet ResultType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } +}; + +template +struct conj_helper +{ + typedef Packet ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return pconj(Eigen::internal::pmul(x, y)); } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/externals/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/externals/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h new file mode 100644 index 00000000..c9fbaf68 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -0,0 +1,1649 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2009-2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The exp and log functions of this file initially come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Creates a Scalar integer type with same bit-width. 
+template struct make_integer; +template<> struct make_integer { typedef numext::int32_t type; }; +template<> struct make_integer { typedef numext::int64_t type; }; +template<> struct make_integer { typedef numext::int16_t type; }; +template<> struct make_integer { typedef numext::int16_t type; }; + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic_get_biased_exponent(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::integer_packet PacketI; + enum { mantissa_bits = numext::numeric_limits::digits - 1}; + return pcast(plogical_shift_right(preinterpret(pabs(a)))); +} + +// Safely applies frexp, correctly handles denormals. +// Assumes IEEE floating point format. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic(const Packet& a, Packet& exponent) { + typedef typename unpacket_traits::type Scalar; + typedef typename make_unsigned::type>::type ScalarUI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask = + ~(((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)) << int(MantissaBits)); // ~0x7f800000 + const Packet sign_mantissa_mask = pset1frombits(static_cast(scalar_sign_mantissa_mask)); + const Packet half = pset1(Scalar(0.5)); + const Packet zero = pzero(a); + const Packet normal_min = pset1((numext::numeric_limits::min)()); // Minimum normal value, 2^-126 + + // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1). + const Packet is_denormal = pcmp_lt(pabs(a), normal_min); + EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(int(MantissaBits) + 1); // 24 + // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr. 
+ const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24 + const Packet normalization_factor = pset1(scalar_normalization_factor); + const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a); + + // Determine exponent offset: -126 if normal, -126-24 if denormal + const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(int(ExponentBits)-1)) - ScalarUI(2)); // -126 + Packet exponent_offset = pset1(scalar_exponent_offset); + const Packet normalization_offset = pset1(-Scalar(scalar_normalization_offset)); // -24 + exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset); + + // Determine exponent and mantissa from normalized_a. + exponent = pfrexp_generic_get_biased_exponent(normalized_a); + // Zero, Inf and NaN return 'a' unmodified, exponent is zero + // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero) + const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << int(ExponentBits)) - ScalarUI(1)); // 255 + const Packet non_finite_exponent = pset1(scalar_non_finite_exponent); + const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent)); + const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half)); + exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset)); + return m; +} + +// Safely applies ldexp, correctly handles overflows, underflows and denormals. +// Assumes IEEE floating point format. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pldexp_generic(const Packet& a, const Packet& exponent) { + // We want to return a * 2^exponent, allowing for all possible integer + // exponents without overflowing or underflowing in intermediate + // computations. 
+ // + // Since 'a' and the output can be denormal, the maximum range of 'exponent' + // to consider for a float is: + // -255-23 -> 255+23 + // Below -278 any finite float 'a' will become zero, and above +278 any + // finite float will become inf, including when 'a' is the smallest possible + // denormal. + // + // Unfortunately, 2^(278) cannot be represented using either one or two + // finite normal floats, so we must split the scale factor into at least + // three parts. It turns out to be faster to split 'exponent' into four + // factors, since [exponent>>2] is much faster to compute that [exponent/3]. + // + // Set e = min(max(exponent, -278), 278); + // b = floor(e/4); + // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b)) + // + // This will avoid any intermediate overflows and correctly handle 0, inf, + // NaN cases. + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + const Packet max_exponent = pset1(Scalar((ScalarI(1)<((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127 + const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); + Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b + Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) + out = pmul(out, c); + return out; +} + +// Explicitly multiplies +// a * (2^e) +// clamping e to the range +// [NumTraits::min_exponent()-2, NumTraits::max_exponent()] +// +// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow +// if 2^e doesn't fit into a normal floating-point Scalar. 
+// +// Assumes IEEE floating point format +template +struct pldexp_fast_impl { + typedef typename unpacket_traits::integer_packet PacketI; + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = numext::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + Packet run(const Packet& a, const Packet& exponent) { + const Packet bias = pset1(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1))); // 127 + const Packet limit = pset1(Scalar((ScalarI(1)<(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 + // return a * (2^e) + return pmul(a, preinterpret(plogical_shift_left(e))); + } +}; + +// Natural or base 2 logarithm. +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... -> profit! +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_impl_float(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0f); + const Packet cst_neg_half = pset1(-0.5f); + // The smallest non denormalized float number. + const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); + const Packet cst_minus_inf = pset1frombits( 0xff800000u); + const Packet cst_pos_inf = pset1frombits( 0x7f800000u); + + // Polynomial coefficients. 
+ const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); + const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); + const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); + const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); + const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); + const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); + const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); + const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); + const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); + const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet y, y1, y2; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y2 = pmadd(y2, x, cst_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y = pmadd(cst_neg_half, x2, y); + x = padd(x, y); + + // Add the logarithm of the exponent back to the result of the interpolation. 
+ if (base2) { + const Packet cst_log2e = pset1(static_cast(EIGEN_LOG2E)); + x = pmadd(x, cst_log2e, e); + } else { + const Packet cst_ln2 = pset1(static_cast(EIGEN_LN2)); + x = pmadd(e, cst_ln2, x); + } + + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x) +{ + return plog_impl_float(_x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_float(const Packet _x) +{ + return plog_impl_float(_x); +} + +/* Returns the base e (2.718...) or base 2 logarithm of x. + * The argument is separated into its exponent and fractional parts. + * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)], + * is approximated by + * + * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x). + * + * for more detail see: http://www.netlib.org/cephes/ + */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_impl_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_neg_half = pset1(-0.5); + // The smallest non denormalized double. 
+ const Packet cst_min_norm_pos = pset1frombits( static_cast(0x0010000000000000ull)); + const Packet cst_minus_inf = pset1frombits( static_cast(0xfff0000000000000ull)); + const Packet cst_pos_inf = pset1frombits( static_cast(0x7ff0000000000000ull)); + + + // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x) + // 1/sqrt(2) <= x < sqrt(2) + const Packet cst_cephes_SQRTHF = pset1(0.70710678118654752440E0); + const Packet cst_cephes_log_p0 = pset1(1.01875663804580931796E-4); + const Packet cst_cephes_log_p1 = pset1(4.97494994976747001425E-1); + const Packet cst_cephes_log_p2 = pset1(4.70579119878881725854E0); + const Packet cst_cephes_log_p3 = pset1(1.44989225341610930846E1); + const Packet cst_cephes_log_p4 = pset1(1.79368678507819816313E1); + const Packet cst_cephes_log_p5 = pset1(7.70838733755885391666E0); + + const Packet cst_cephes_log_q0 = pset1(1.0); + const Packet cst_cephes_log_q1 = pset1(1.12873587189167450590E1); + const Packet cst_cephes_log_q2 = pset1(4.52279145837532221105E1); + const Packet cst_cephes_log_q3 = pset1(8.29875266912776603211E1); + const Packet cst_cephes_log_q4 = pset1(7.11544750618563894466E1); + const Packet cst_cephes_log_q5 = pset1(2.31251620126765340583E1); + + // Truncate input values to the minimum positive normal. + x = pmax(x, cst_min_norm_pos); + + Packet e; + // extract significant in the range [0.5,1) and exponent + x = pfrexp(x,e); + + // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); + Packet tmp = pand(x, mask); + x = psub(x, cst_1); + e = psub(e, pand(cst_1, mask)); + x = padd(x, tmp); + + Packet x2 = pmul(x, x); + Packet x3 = pmul(x2, x); + + // Evaluate the polynomial approximant , probably to improve instruction-level parallelism. 
+ // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) ); + Packet y, y1, y_; + y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); + y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); + y = pmadd(y, x, cst_cephes_log_p2); + y1 = pmadd(y1, x, cst_cephes_log_p5); + y_ = pmadd(y, x3, y1); + + y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1); + y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4); + y = pmadd(y, x, cst_cephes_log_q2); + y1 = pmadd(y1, x, cst_cephes_log_q5); + y = pmadd(y, x3, y1); + + y_ = pmul(y_, x3); + y = pdiv(y_, y); + + y = pmadd(cst_neg_half, x2, y); + x = padd(x, y); + + // Add the logarithm of the exponent back to the result of the interpolation. + if (base2) { + const Packet cst_log2e = pset1(static_cast(EIGEN_LOG2E)); + x = pmadd(x, cst_log2e, e); + } else { + const Packet cst_ln2 = pset1(static_cast(EIGEN_LN2)); + x = pmadd(e, cst_ln2, x); + } + + Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); + Packet iszero_mask = pcmp_eq(_x,pzero(_x)); + Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN + // - 0 will be -INF + // - +INF will be +INF + return pselect(iszero_mask, cst_minus_inf, + por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_double(const Packet _x) +{ + return plog_impl_double(_x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_double(const Packet _x) +{ + return plog_impl_double(_x); +} + +/** \internal \returns log(1 + x) computed using W. Kahan's formula. 
+ See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_plog1p(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + Packet xp1 = padd(x, one); + Packet small_mask = pcmp_eq(xp1, one); + Packet log1 = plog(xp1); + Packet inf_mask = pcmp_eq(xp1, log1); + Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); + return pselect(por(small_mask, inf_mask), x, log_large); +} + +/** \internal \returns exp(x)-1 computed using W. Kahan's formula. + See: http://www.plunk.org/~hatch/rightway.php + */ +template +Packet generic_expm1(const Packet& x) +{ + typedef typename unpacket_traits::type ScalarType; + const Packet one = pset1(ScalarType(1)); + const Packet neg_one = pset1(ScalarType(-1)); + Packet u = pexp(x); + Packet one_mask = pcmp_eq(u, one); + Packet u_minus_one = psub(u, one); + Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one); + Packet logu = plog(u); + // The following comparison is to catch the case where + // exp(x) = +inf. It is written in this way to avoid having + // to form the constant +inf, which depends on the packet + // type. + Packet pos_inf_mask = pcmp_eq(logu, u); + Packet expm1 = pmul(u_minus_one, pdiv(x, logu)); + expm1 = pselect(pos_inf_mask, u, expm1); + return pselect(one_mask, + x, + pselect(neg_one_mask, + neg_one, + expm1)); +} + + +// Exponential function. Works by writing "x = m*log(2) + r" where +// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then +// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). 
+template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x) +{ + const Packet cst_1 = pset1(1.0f); + const Packet cst_half = pset1(0.5f); + const Packet cst_exp_hi = pset1( 88.723f); + const Packet cst_exp_lo = pset1(-88.723f); + + const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); + const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); + const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); + const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); + const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); + const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); + const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); + + // Clamp x. + Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half)); + + // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is + // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating + // truncation errors. + const Packet cst_cephes_exp_C1 = pset1(-0.693359375f); + const Packet cst_cephes_exp_C2 = pset1(2.12194440e-4f); + Packet r = pmadd(m, cst_cephes_exp_C1, x); + r = pmadd(m, cst_cephes_exp_C2, r); + + Packet r2 = pmul(r, r); + Packet r3 = pmul(r2, r); + + // Evaluate the polynomial approximant,improved by instruction-level parallelism. + Packet y, y1, y2; + y = pmadd(cst_cephes_exp_p0, r, cst_cephes_exp_p1); + y1 = pmadd(cst_cephes_exp_p3, r, cst_cephes_exp_p4); + y2 = padd(r, cst_1); + y = pmadd(y, r, cst_cephes_exp_p2); + y1 = pmadd(y1, r, cst_cephes_exp_p5); + y = pmadd(y, r3, y1); + y = pmadd(y, r2, y2); + + // Return 2^m * exp(r). + // TODO: replace pldexp with faster implementation since y in [-1, 1). 
+ return pmax(pldexp(y,m), _x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x) +{ + Packet x = _x; + + const Packet cst_1 = pset1(1.0); + const Packet cst_2 = pset1(2.0); + const Packet cst_half = pset1(0.5); + + const Packet cst_exp_hi = pset1(709.784); + const Packet cst_exp_lo = pset1(-709.784); + + const Packet cst_cephes_LOG2EF = pset1(1.4426950408889634073599); + const Packet cst_cephes_exp_p0 = pset1(1.26177193074810590878e-4); + const Packet cst_cephes_exp_p1 = pset1(3.02994407707441961300e-2); + const Packet cst_cephes_exp_p2 = pset1(9.99999999999999999910e-1); + const Packet cst_cephes_exp_q0 = pset1(3.00198505138664455042e-6); + const Packet cst_cephes_exp_q1 = pset1(2.52448340349684104192e-3); + const Packet cst_cephes_exp_q2 = pset1(2.27265548208155028766e-1); + const Packet cst_cephes_exp_q3 = pset1(2.00000000000000000009e0); + const Packet cst_cephes_exp_C1 = pset1(0.693145751953125); + const Packet cst_cephes_exp_C2 = pset1(1.42860682030941723212e-6); + + Packet tmp, fx; + + // clamp x + x = pmax(pmin(x, cst_exp_hi), cst_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(cst_cephes_LOG2EF, x, cst_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = pfloor(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + tmp = pmul(fx, cst_cephes_exp_C1); + Packet z = pmul(fx, cst_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet px = cst_cephes_exp_p0; + px = pmadd(px, x2, cst_cephes_exp_p1); + px = pmadd(px, x2, cst_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. 
+ Packet qx = cst_cephes_exp_q0; + qx = pmadd(qx, x2, cst_cephes_exp_q1); + qx = pmadd(qx, x2, cst_cephes_exp_q2); + qx = pmadd(qx, x2, cst_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = pdiv(px, psub(qx, px)); + x = pmadd(cst_2, x, cst_1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + // TODO: replace pldexp with faster implementation since x in [-1, 1). + return pmax(pldexp(x,fx), _x); +} + +// The following code is inspired by the following stack-overflow answer: +// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 +// It has been largely optimized: +// - By-pass calls to frexp. +// - Aligned loads of required 96 bits of 2/pi. This is accomplished by +// (1) balancing the mantissa and exponent to the required bits of 2/pi are +// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. +// - Avoid a branch in rounding and extraction of the remaining fractional part. +// Overall, I measured a speed up higher than x2 on x86-64. +inline float trig_reduce_huge (float xf, int *quadrant) +{ + using Eigen::numext::int32_t; + using Eigen::numext::uint32_t; + using Eigen::numext::int64_t; + using Eigen::numext::uint64_t; + + const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 + const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt + + // 192 bits of 2/pi for Payne-Hanek reduction + // Bits are introduced by packet of 8 to enable aligned reads. 
+ static const uint32_t two_over_pi [] = + { + 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, + 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, + 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, + 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, + 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566, + 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, + 0x10e41000, 0xe4100000 + }; + + uint32_t xi = numext::bit_cast(xf); + // Below, -118 = -126 + 8. + // -126 is to get the exponent, + // +8 is to enable alignment of 2/pi's bits on 8 bits. + // This is possible because the fractional part of x as only 24 meaningful bits. + uint32_t e = (xi >> 23) - 118; + // Extract the mantissa and shift it to align it wrt the exponent + xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7); + + uint32_t i = e >> 3; + uint32_t twoopi_1 = two_over_pi[i-1]; + uint32_t twoopi_2 = two_over_pi[i+3]; + uint32_t twoopi_3 = two_over_pi[i+7]; + + // Compute x * 2/pi in 2.62-bit fixed-point format. + uint64_t p; + p = uint64_t(xi) * twoopi_3; + p = uint64_t(xi) * twoopi_2 + (p >> 32); + p = (uint64_t(xi * twoopi_1) << 32) + p; + + // Round to nearest: add 0.5 and extract integral part. + uint64_t q = (p + zero_dot_five) >> 62; + *quadrant = int(q); + // Now it remains to compute "r = x - q*pi/2" with high accuracy, + // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as: + // r = (p-q)*pi/2, + // where the product can be be carried out with sufficient accuracy using double precision. 
+ p -= q<<62; + return float(double(int64_t(p)) * pio2_62); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT +__attribute__((optimize("-fno-unsafe-math-optimizations"))) +#endif +Packet psincos_float(const Packet& _x) +{ + typedef typename unpacket_traits::integer_packet PacketI; + + const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI + const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding + const PacketI csti_1 = pset1(1); + const Packet cst_sign_mask = pset1frombits(0x80000000u); + + Packet x = pabs(_x); + + // Scale x by 2/Pi to find x's octant. + Packet y = pmul(x, cst_2oPI); + + // Rounding trick: + Packet y_round = padd(y, cst_rounding_magic); + EIGEN_OPTIMIZATION_BARRIER(y_round) + PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) + y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // using "Extended precision modular arithmetic" + #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) + // This version requires true FMA for high accuracy + // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): + const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; + x = pmadd(y, pset1(-1.57079601287841796875f), x); + x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); + x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); + #else + // Without true FMA, the previous set of coefficients maintain 1ULP accuracy + // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. + // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. + + // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. + // and 2 ULP up to: + const float huge_th = ComputeSine ? 
25966.f : 18838.f; + x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 + EIGEN_OPTIMIZATION_BARRIER(x) + x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 + EIGEN_OPTIMIZATION_BARRIER(x) + x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 + x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee + + // For the record, the following set of coefficients maintain 2ULP up + // to a slightly larger range: + // const float huge_th = ComputeSine ? 51981.f : 39086.125f; + // but it slightly fails to maintain 1ULP for two values of sin below pi. + // x = pmadd(y, pset1(-3.140625/2.), x); + // x = pmadd(y, pset1(-0.00048351287841796875), x); + // x = pmadd(y, pset1(-3.13855707645416259765625e-07), x); + // x = pmadd(y, pset1(-6.0771006282767103812147979624569416046142578125e-11), x); + + // For the record, with only 3 iterations it is possible to maintain + // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. + // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee + #endif + + if(predux_any(pcmp_le(pset1(huge_th),pabs(_x)))) + { + const int PacketSize = unpacket_traits::size; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize]; + EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize]; + pstoreu(vals, pabs(_x)); + pstoreu(x_cpy, x); + pstoreu(y_int2, y_int); + for(int k=0; k=huge_th && (numext::isfinite)(val)) + x_cpy[k] = trig_reduce_huge(val,&y_int2[k]); + } + x = ploadu(x_cpy); + y_int = ploadu(y_int2); + } + + // Compute the sign to apply to the polynomial. + // sin: sign = second_bit(y_int) xor signbit(_x) + // cos: sign = second_bit(y_int+1) + Packet sign_bit = ComputeSine ? 
pxor(_x, preinterpret(plogical_shift_left<30>(y_int))) + : preinterpret(plogical_shift_left<30>(padd(y_int,csti_1))); + sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit + + // Get the polynomial selection mask from the second bit of y_int + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); + + Packet x2 = pmul(x,x); + + // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) + Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); + y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); + y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); + y1 = pmadd(y1, x2, pset1(-0.5f)); + y1 = pmadd(y1, x2, pset1(1.f)); + + // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) + // octave/matlab code to compute those coefficients: + // x = (0:0.0001:pi/4)'; + // A = [x.^3 x.^5 x.^7]; + // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy + // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 + // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) + // + Packet y2 = pset1(-0.0001959234114083702898469196984621021329076029360294342041015625f); + y2 = pmadd(y2, x2, pset1( 0.0083326873655616851693794799871284340042620897293090820312500000f)); + y2 = pmadd(y2, x2, pset1(-0.1666666203982298255503735617821803316473960876464843750000000000f)); + y2 = pmul(y2, x2); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = ComputeSine ? 
pselect(poly_mask,y2,y1) + : pselect(poly_mask,y1,y2); + + // Update the sign and filter huge inputs + return pxor(y, sign_bit); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x) +{ + return psincos_float(x); +} + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x) +{ + return psincos_float(x); +} + + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename unpacket_traits::as_real RealPacket; + + // Computes the principal sqrt of the complex numbers in the input. + // + // For example, for packets containing 2 complex numbers stored in interleaved format + // a = [a0, a1] = [x0, y0, x1, y1], + // where x0 = real(a0), y0 = imag(a0) etc., this function returns + // b = [b0, b1] = [u0, v0, u1, v1], + // such that b0^2 = a0, b1^2 = a1. + // + // To derive the formula for the complex square roots, let's consider the equation for + // a single complex square root of the number x + i*y. We want to find real numbers + // u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = 0.5 * (y / u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = 0.5 * (y / v) + // + // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as + // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , + + // In the following, without lack of generality, we have annotated the code, assuming + // that the input is a packet of 2 complex numbers. + // + // Step 1. 
Compute l = [l0, l0, l1, l1], where + // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2) + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), + // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. + + RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|] + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|] + RealPacket a_max = pmax(a_abs, a_abs_flip); + RealPacket a_min = pmin(a_abs, a_abs_flip); + RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); + RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max)); + RealPacket r = pdiv(a_min, a_max); + const RealPacket cst_one = pset1(RealScalar(1)); + RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1] + // Set l to a_max if a_min is zero. + l = pselect(a_min_zero_mask, a_max, l); + + // Step 2. Compute [rho0, *, rho1, *], where + // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|)) + // We don't care about the imaginary parts computed here. They will be overwritten later. + const RealPacket cst_half = pset1(RealScalar(0.5)); + Packet rho; + rho.v = psqrt(pmul(cst_half, padd(a_abs, l))); + + // Step 3. Compute [rho0, eta0, rho1, eta1], where + // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2. + // set eta = 0 of input is 0 + i0. + RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask); + RealPacket real_mask = peven_mask(a.v); + Packet positive_real_result; + // Compute result for inputs with positive real part. + positive_real_result.v = pselect(real_mask, rho.v, eta); + + // Step 4. 
Compute solution for inputs with negative real part: + // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] + const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); + const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; + RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); + Packet negative_real_result; + // Notice that rho is positive, so taking it's absolute value is a noop. + negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs); + + // Step 5. Select solution branch based on the sign of the real parts. + Packet negative_real_mask; + negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v)); + negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v); + Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result); + + // Step 6. Handle special cases for infinities: + // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN + // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN + // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y + // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + Packet is_inf; + is_inf.v = pcmp_eq(a_abs, cst_pos_inf); + Packet is_real_inf; + is_real_inf.v = pand(is_inf.v, real_mask); + is_real_inf = por(is_real_inf, pcplxflip(is_real_inf)); + // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. + Packet real_inf_result; + real_inf_result.v = pmul(a_abs, pset1(Scalar(RealScalar(1.0), RealScalar(0.0))).v); + real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v); + // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. 
+ Packet is_imag_inf; + is_imag_inf.v = pandnot(is_inf.v, real_mask); + is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf)); + Packet imag_inf_result; + imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask)); + + return pselect(is_imag_inf, imag_inf_result, + pselect(is_real_inf, real_inf_result,result)); +} + +// TODO(rmlarsen): The following set of utilities for double word arithmetic +// should perhaps be refactored as a separate file, since it would be generally +// useful for special function implementation etc. Writing the algorithms in +// terms of a double word type would also make the code more readable. + +// This function splits x into the nearest integer n and fractional part r, +// such that x = n + r holds exactly. +template +EIGEN_STRONG_INLINE +void absolute_split(const Packet& x, Packet& n, Packet& r) { + n = pround(x); + r = psub(x, n); +} + +// This function computes the sum {s_hi, s_lo}, such that x + y = s_hi + s_lo +// holds exactly, and s_hi = fl(x+y), if |x| >= |y|. +template +EIGEN_STRONG_INLINE +void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { + s_hi = padd(x, y); + const Packet t = psub(s_hi, x); + s_lo = psub(y, t); +} + +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +// This function implements the extended precision product of +// a pair of floating point numbers. Given {x, y}, it computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x, const Packet& y, + Packet& p_hi, Packet& p_lo) { + p_hi = pmul(x, y); + p_lo = pmadd(x, y, pnegate(p_hi)); +} + +#else + +// This function implements the Veltkamp splitting. Given a floating point +// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds +// exactly and that half of the significand of x fits in x_hi. +// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016.
+template +EIGEN_STRONG_INLINE +void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) { + typedef typename unpacket_traits::type Scalar; + EIGEN_CONSTEXPR int shift = (NumTraits::digits() + 1) / 2; + const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr. + const Packet gamma = pmul(pset1(shift_scale + Scalar(1)), x); + Packet rho = psub(x, gamma); + x_hi = padd(rho, gamma); + x_lo = psub(x, x_hi); +} + +// This function implements Dekker's algorithm for products x * y. +// Given floating point numbers {x, y} computes the pair +// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and +// p_hi = fl(x * y). +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet x_hi, x_lo, y_hi, y_lo; + veltkamp_splitting(x, x_hi, x_lo); + veltkamp_splitting(y, y_hi, y_lo); + + p_hi = pmul(x, y); + p_lo = pmadd(x_hi, y_hi, pnegate(p_hi)); + p_lo = pmadd(x_hi, y_lo, p_lo); + p_lo = pmadd(x_lo, y_hi, p_lo); + p_lo = pmadd(x_lo, y_lo, p_lo); +} + +#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD + + +// This function implements Dekker's algorithm for the addition +// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {s_hi, s_lo} such that +// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly. +// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. 
+template +EIGEN_STRONG_INLINE + void twosum(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi)); + Packet r_hi_1, r_lo_1; + fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1); + Packet r_hi_2, r_lo_2; + fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2); + const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2); + + const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo); + const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo); + const Packet s = pselect(x_greater_mask, s1, s2); + + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for double word numbers, +// which assumes that |x_hi| >= |y_hi|. +template +EIGEN_STRONG_INLINE + void fast_twosum(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x_hi, y_hi, r_hi, r_lo); + const Packet s = padd(padd(y_lo, r_lo), x_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This is a version of twosum for adding a floating point number x to +// double word number {y_hi, y_lo} number, with the assumption +// that |x| >= |y_hi|. +template +EIGEN_STRONG_INLINE +void fast_twosum(const Packet& x, + const Packet& y_hi, const Packet& y_lo, + Packet& s_hi, Packet& s_lo) { + Packet r_hi, r_lo; + fast_twosum(x, y_hi, r_hi, r_lo); + const Packet s = padd(y_lo, r_lo); + fast_twosum(r_hi, s, s_hi, s_lo); +} + +// This function implements the multiplication of a double word +// number represented by {x_hi, x_lo} by a floating point number y. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions", +// 3rd edition, Birkh\"auser, 2016. 
+template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, + Packet& p_hi, Packet& p_lo) { + Packet c_hi, c_lo1; + twoprod(x_hi, y, c_hi, c_lo1); + const Packet c_lo2 = pmul(x_lo, y); + Packet t_hi, t_lo1; + fast_twosum(c_hi, c_lo2, t_hi, t_lo1); + const Packet t_lo2 = padd(t_lo1, c_lo1); + fast_twosum(t_hi, t_lo2, p_hi, p_lo); +} + +// This function implements the multiplication of two double word +// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. +// It returns the result as a pair {p_hi, p_lo} such that +// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error +// of less than 2*2^{-2p}, where p is the number of significand bit +// in the floating point type. +template +EIGEN_STRONG_INLINE +void twoprod(const Packet& x_hi, const Packet& x_lo, + const Packet& y_hi, const Packet& y_lo, + Packet& p_hi, Packet& p_lo) { + Packet p_hi_hi, p_hi_lo; + twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo); + Packet p_lo_hi, p_lo_lo; + twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo); + fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo); +} + +// This function computes the reciprocal of a floating point number +// with extra precision and returns the result as a double word. +template +void doubleword_reciprocal(const Packet& x, Packet& recip_hi, Packet& recip_lo) { + typedef typename unpacket_traits::type Scalar; + // 1. Approximate the reciprocal as the reciprocal of the high order element. + Packet approx_recip = prsqrt(x); + approx_recip = pmul(approx_recip, approx_recip); + + // 2. Run one step of Newton-Raphson iteration in double word arithmetic + // to get the bottom half. 
The NR iteration for reciprocal of 'a' is + // x_{i+1} = x_i * (2 - a * x_i) + + // -a*x_i + Packet t1_hi, t1_lo; + twoprod(pnegate(x), approx_recip, t1_hi, t1_lo); + // 2 - a*x_i + Packet t2_hi, t2_lo; + fast_twosum(pset1(Scalar(2)), t1_hi, t2_hi, t2_lo); + Packet t3_hi, t3_lo; + fast_twosum(t2_hi, padd(t2_lo, t1_lo), t3_hi, t3_lo); + // x_i * (2 - a * x_i) + twoprod(t3_hi, t3_lo, approx_recip, recip_hi, recip_lo); +} + + +// This function computes log2(x) and returns the result as a double word. +template +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) { + log2_x_hi = plog2(x); + log2_x_lo = pzero(x); + } +}; + +// This specialization uses a more accurate algorithm to compute log2(x) for +// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.42e-10. +// This additional accuracy is needed to counter the error-magnification +// inherent in multiplying by a potentially large exponent in pow(x,y). +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) { + // The function log(1+x)/x is approximated in the interval + // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form + // Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))), + // where the degree 6 polynomial P(x) is evaluated in single precision, + // while the remaining 4 terms of Q(x), as well as the final multiplication by x + // to reconstruct log(1+x) are evaluated in extra precision using + // double word arithmetic. C0 through C3 are extra precise constants + // stored as double words. 
+ // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 10; + // > f = log2(1+x)/x; + // > interval = [sqrt(0.5)-1;sqrt(2)-1]; + // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating); + + const Packet p6 = pset1( 9.703654795885e-2f); + const Packet p5 = pset1(-0.1690667718648f); + const Packet p4 = pset1( 0.1720575392246f); + const Packet p3 = pset1(-0.1789081543684f); + const Packet p2 = pset1( 0.2050433009862f); + const Packet p1 = pset1(-0.2404672354459f); + const Packet p0 = pset1( 0.2885761857032f); + + const Packet C3_hi = pset1(-0.360674142838f); + const Packet C3_lo = pset1(-6.13283912543e-09f); + const Packet C2_hi = pset1(0.480897903442f); + const Packet C2_lo = pset1(-1.44861207474e-08f); + const Packet C1_hi = pset1(-0.721347510815f); + const Packet C1_lo = pset1(-4.84483164698e-09f); + const Packet C0_hi = pset1(1.44269502163f); + const Packet C0_lo = pset1(2.01711713999e-08f); + const Packet one = pset1(1.0f); + + const Packet x = psub(z, one); + // Evaluate P(x) in working precision. + // We evaluate it in multiple parts to improve instruction level + // parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p6, x2, p4); + p_even = pmadd(p_even, x2, p2); + p_even = pmadd(p_even, x2, p0); + Packet p_odd = pmadd(p5, x2, p3); + p_odd = pmadd(p_odd, x2, p1); + Packet p = pmadd(p_odd, x, p_even); + + // Now evaluate the low-order tems of Q(x) in double word precision. + // In the following, due to the alternating signs and the fact that + // |x| < sqrt(2)-1, we can assume that |C*_hi| >= q_i, and use + // fast_twosum instead of the slower twosum. 
+ Packet q_hi, q_lo; + Packet t_hi, t_lo; + // C3 + x * p(x) + twoprod(p, x, t_hi, t_lo); + fast_twosum(C3_hi, C3_lo, t_hi, t_lo, q_hi, q_lo); + // C2 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C2_hi, C2_lo, t_hi, t_lo, q_hi, q_lo); + // C1 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C1_hi, C1_lo, t_hi, t_lo, q_hi, q_lo); + // C0 + x * p(x) + twoprod(q_hi, q_lo, x, t_hi, t_lo); + fast_twosum(C0_hi, C0_lo, t_hi, t_lo, q_hi, q_lo); + + // log(z) ~= x * Q(x) + twoprod(q_hi, q_lo, x, log2_x_hi, log2_x_lo); + } +}; + +// This specialization uses a more accurate algorithm to compute log2(x) for +// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18. +// This additional accuracy is needed to counter the error-magnification +// inherent in multiplying by a potentially large exponent in pow(x,y). +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. + +template <> +struct accurate_log2 { + template + EIGEN_STRONG_INLINE + void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) { + // We use a transformation of variables: + // r = c * (x-1) / (x+1), + // such that + // log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r). + // The function f(r) can be approximated well using an odd polynomial + // of the form + // P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r, + // For the implementation of log2 here, Q is of degree 6 with + // coefficient represented in working precision (double), while C is a + // constant represented in extra precision as a double word to achieve + // full accuracy. 
+ // + // The polynomial coefficients were computed by the Sollya script: + // + // c = 2 / log(2); + // trans = c * (x-1)/(x+1); + // itrans = (1+x/c)/(1-x/c); + // interval=[trans(sqrt(0.5)); trans(sqrt(2))]; + // print(interval); + // f = log2(itrans(x)); + // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating); + const Packet q12 = pset1(2.87074255468000586e-9); + const Packet q10 = pset1(2.38957980901884082e-8); + const Packet q8 = pset1(2.31032094540014656e-7); + const Packet q6 = pset1(2.27279857398537278e-6); + const Packet q4 = pset1(2.31271023278625638e-5); + const Packet q2 = pset1(2.47556738444535513e-4); + const Packet q0 = pset1(2.88543873228900172e-3); + const Packet C_hi = pset1(0.0400377511598501157); + const Packet C_lo = pset1(-4.77726582251425391e-19); + const Packet one = pset1(1.0); + + const Packet cst_2_log2e_hi = pset1(2.88539008177792677); + const Packet cst_2_log2e_lo = pset1(4.07660016854549667e-17); + // c * (x - 1) + Packet num_hi, num_lo; + twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), num_hi, num_lo); + // TODO(rmlarsen): Investigate if using the division algorithm by + // Muller et al. is faster/more accurate. + // 1 / (x + 1) + Packet denom_hi, denom_lo; + doubleword_reciprocal(padd(x, one), denom_hi, denom_lo); + // r = c * (x-1) / (x+1), + Packet r_hi, r_lo; + twoprod(num_hi, num_lo, denom_hi, denom_lo, r_hi, r_lo); + // r2 = r * r + Packet r2_hi, r2_lo; + twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo); + // r4 = r2 * r2 + Packet r4_hi, r4_lo; + twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo); + + // Evaluate Q(r^2) in working precision. We evaluate it in two parts + // (even and odd in r^2) to improve instruction level parallelism. 
+ Packet q_even = pmadd(q12, r4_hi, q8); + Packet q_odd = pmadd(q10, r4_hi, q6); + q_even = pmadd(q_even, r4_hi, q4); + q_odd = pmadd(q_odd, r4_hi, q2); + q_even = pmadd(q_even, r4_hi, q0); + Packet q = pmadd(q_odd, r2_hi, q_even); + + // Now evaluate the low order terms of P(x) in double word precision. + // In the following, due to the increasing magnitude of the coefficients + // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead + // of the slower twosum. + // Q(r^2) * r^2 + Packet p_hi, p_lo; + twoprod(r2_hi, r2_lo, q, p_hi, p_lo); + // Q(r^2) * r^2 + C + Packet p1_hi, p1_lo; + fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo); + // (Q(r^2) * r^2 + C) * r^2 + Packet p2_hi, p2_lo; + twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo); + // ((Q(r^2) * r^2 + C) * r^2 + 1) + Packet p3_hi, p3_lo; + fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo); + + // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r + twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo); + } +}; + +// This function computes exp2(x) (i.e. 2**x). +template +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // TODO(rmlarsen): Add a pexp2 packetop. + return pexp(pmul(pset1(Scalar(EIGEN_LN2)), x)); + } +}; + +// This specialization uses a faster algorithm to compute exp2(x) for floats +// in [-0.5;0.5] with a relative accuracy of 1 ulp. +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. +template <> +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // This function approximates exp2(x) by a degree 6 polynomial of the form + // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in + // single precision, and the remaining steps are evaluated with extra precision using + // double word arithmetic. C is an extra precise constant stored as a double word. 
+ // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 6; + // > f = 2^x; + // > interval = [-0.5;0.5]; + // > p = fpminimax(f,n,[|1,double,single...|],interval,relative,floating); + + const Packet p4 = pset1(1.539513905e-4f); + const Packet p3 = pset1(1.340007293e-3f); + const Packet p2 = pset1(9.618283249e-3f); + const Packet p1 = pset1(5.550328270e-2f); + const Packet p0 = pset1(0.2402264923f); + + const Packet C_hi = pset1(0.6931471825f); + const Packet C_lo = pset1(2.36836577e-08f); + const Packet one = pset1(1.0f); + + // Evaluate P(x) in working precision. + // We evaluate even and odd parts of the polynomial separately + // to gain some instruction level parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p4, x2, p2); + Packet p_odd = pmadd(p3, x2, p1); + p_even = pmadd(p_even, x2, p0); + Packet p = pmadd(p_odd, x, p_even); + + // Evaluate the remaining terms of Q(x) with extra precision using + // double word arithmetic. + Packet p_hi, p_lo; + // x * p(x) + twoprod(p, x, p_hi, p_lo); + // C + x * p(x) + Packet q1_hi, q1_lo; + twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo); + // x * (C + x * p(x)) + Packet q2_hi, q2_lo; + twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo); + // 1 + x * (C + x * p(x)) + Packet q3_hi, q3_lo; + // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum + // for adding it to unity here. + fast_twosum(one, q2_hi, q3_hi, q3_lo); + return padd(q3_hi, padd(q2_lo, q3_lo)); + } +}; + +// in [-0.5;0.5] with a relative accuracy of 1 ulp. +// The minimax polynomial used was calculated using the Sollya tool. +// See sollya.org. 
+template <> +struct fast_accurate_exp2 { + template + EIGEN_STRONG_INLINE + Packet operator()(const Packet& x) { + // This function approximates exp2(x) by a degree 10 polynomial of the form + // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in + // single precision, and the remaining steps are evaluated with extra precision using + // double word arithmetic. C is an extra precise constant stored as a double word. + // + // The polynomial coefficients were calculated using Sollya commands: + // > n = 11; + // > f = 2^x; + // > interval = [-0.5;0.5]; + // > p = fpminimax(f,n,[|1,DD,double...|],interval,relative,floating); + + const Packet p9 = pset1(4.431642109085495276e-10); + const Packet p8 = pset1(7.073829923303358410e-9); + const Packet p7 = pset1(1.017822306737031311e-7); + const Packet p6 = pset1(1.321543498017646657e-6); + const Packet p5 = pset1(1.525273342728892877e-5); + const Packet p4 = pset1(1.540353045780084423e-4); + const Packet p3 = pset1(1.333355814685869807e-3); + const Packet p2 = pset1(9.618129107593478832e-3); + const Packet p1 = pset1(5.550410866481961247e-2); + const Packet p0 = pset1(0.240226506959101332); + const Packet C_hi = pset1(0.693147180559945286); + const Packet C_lo = pset1(4.81927865669806721e-17); + const Packet one = pset1(1.0); + + // Evaluate P(x) in working precision. + // We evaluate even and odd parts of the polynomial separately + // to gain some instruction level parallelism. + Packet x2 = pmul(x,x); + Packet p_even = pmadd(p8, x2, p6); + Packet p_odd = pmadd(p9, x2, p7); + p_even = pmadd(p_even, x2, p4); + p_odd = pmadd(p_odd, x2, p5); + p_even = pmadd(p_even, x2, p2); + p_odd = pmadd(p_odd, x2, p3); + p_even = pmadd(p_even, x2, p0); + p_odd = pmadd(p_odd, x2, p1); + Packet p = pmadd(p_odd, x, p_even); + + // Evaluate the remaining terms of Q(x) with extra precision using + // double word arithmetic. 
+ Packet p_hi, p_lo; + // x * p(x) + twoprod(p, x, p_hi, p_lo); + // C + x * p(x) + Packet q1_hi, q1_lo; + twosum(p_hi, p_lo, C_hi, C_lo, q1_hi, q1_lo); + // x * (C + x * p(x)) + Packet q2_hi, q2_lo; + twoprod(q1_hi, q1_lo, x, q2_hi, q2_lo); + // 1 + x * (C + x * p(x)) + Packet q3_hi, q3_lo; + // Since |q2_hi| <= sqrt(2)-1 < 1, we can use fast_twosum + // for adding it to unity here. + fast_twosum(one, q2_hi, q3_hi, q3_lo); + return padd(q3_hi, padd(q2_lo, q3_lo)); + } +}; + +// This function implements the non-trivial case of pow(x,y) where x is +// positive and y is (possibly) non-integer. +// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x. +// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it +// easier to specialize or turn off for specific types and/or backends.x +template +EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + // Split x into exponent e_x and mantissa m_x. + Packet e_x; + Packet m_x = pfrexp(x, e_x); + + // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x). + EIGEN_CONSTEXPR Scalar sqrt_half = Scalar(0.70710678118654752440); + const Packet m_x_scale_mask = pcmp_lt(m_x, pset1(sqrt_half)); + m_x = pselect(m_x_scale_mask, pmul(pset1(Scalar(2)), m_x), m_x); + e_x = pselect(m_x_scale_mask, psub(e_x, pset1(Scalar(1))), e_x); + + // Compute log2(m_x) with 6 extra bits of accuracy. + Packet rx_hi, rx_lo; + accurate_log2()(m_x, rx_hi, rx_lo); + + // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled + // precision using double word arithmetic. + Packet f1_hi, f1_lo, f2_hi, f2_lo; + twoprod(e_x, y, f1_hi, f1_lo); + twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo); + // Sum the two terms in f using double word arithmetic. We know + // that |e_x| > |log2(m_x)|, except for the case where e_x==0. + // This means that we can use fast_twosum(f1,f2). 
+ // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any + // accuracy by violating the assumption of fast_twosum, because + // it's a no-op. + Packet f_hi, f_lo; + fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo); + + // Split f into integer and fractional parts. + Packet n_z, r_z; + absolute_split(f_hi, n_z, r_z); + r_z = padd(r_z, f_lo); + Packet n_r; + absolute_split(r_z, n_r, r_z); + n_z = padd(n_z, n_r); + + // We now have an accurate split of f = n_z + r_z and can compute + // x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}. + // Since r_z is in [-0.5;0.5], we compute the first factor to high accuracy + // using a specialized algorithm. Multiplication by the second factor can + // be done exactly using pldexp(), since it is an integer power of 2. + const Packet e_r = fast_accurate_exp2()(r_z); + return pldexp(e_r, n_z); +} + +// Generic implementation of pow(x,y). +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet generic_pow(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::type Scalar; + + const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_zero = pset1(Scalar(0)); + const Packet cst_one = pset1(Scalar(1)); + const Packet cst_nan = pset1(NumTraits::quiet_NaN()); + + const Packet abs_x = pabs(x); + // Predicates for sign and magnitude of x. + const Packet x_is_zero = pcmp_eq(x, cst_zero); + const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); + const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); + + // Predicates for sign and magnitude of y. 
+ const Packet y_is_one = pcmp_eq(y, cst_one); + const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet y_is_neg = pcmp_lt(y, cst_zero); + const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); + const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + EIGEN_CONSTEXPR Scalar huge_exponent = + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / + NumTraits::epsilon(); + const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); + + // Predicates for whether y is integer and/or even. + const Packet y_is_int = pcmp_eq(pfloor(y), y); + const Packet y_div_2 = pmul(y, pset1(Scalar(0.5))); + const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); + + // Predicates encoding special cases for the value of pow(x,y) + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), + y_is_int), + abs_y_is_inf); + const Packet pow_is_one = por(por(x_is_one, y_is_zero), + pand(x_is_neg_one, + por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); + const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), + pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), + y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), + y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), + pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), + y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), + y_is_pos)); + + // General computation of pow(x,y) for positive x or negative x and integer y. 
+ const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); + const Packet pow_abs = generic_pow_impl(abs_x, y); + return pselect(y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, cst_pos_inf, + pselect(pow_is_zero, cst_zero, + pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); +} + + + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + return pmadd(ppolevl::run(x, coeff), x, pset1(coeff[N])); + } +}; + +template +struct ppolevl { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { + EIGEN_UNUSED_VARIABLE(x); + return pset1(coeff[0]); + } +}; + +/* chbevl (modified for Eigen) + * + * Evaluate Chebyshev series + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N], chebevl(); + * + * y = chbevl( x, coef, N ); + * + * + * + * DESCRIPTION: + * + * Evaluates the series + * + * N-1 + * - ' + * y = > coef[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. 
the zero + * order term is last in the array. Note N is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must + * have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), + * over which the Chebyshev polynomials are defined. + * + * If the coefficients are for the inverted interval, in + * which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, + * this becomes x -> 4a/x - 1. + * + * + * + * SPEED: + * + * Taking advantage of the recurrence properties of the + * Chebyshev polynomials, the routine requires one more + * addition per loop than evaluating a nested polynomial of + * the same degree. + * + */ + +template +struct pchebevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits::type coef[]) { + typedef typename unpacket_traits::type Scalar; + Packet b0 = pset1(coef[0]); + Packet b1 = pset1(static_cast(0.f)); + Packet b2; + + for (int i = 1; i < N; i++) { + b2 = b1; + b1 = b0; + b0 = psub(pmadd(x, b1, pset1(coef[i])), b2); + } + + return pmul(pset1(static_cast(0.5f)), psub(b0, b2)); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H diff --git a/externals/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/externals/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h new file mode 100644 index 00000000..177a04e9 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -0,0 +1,110 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2019 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H +#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H + +namespace Eigen { +namespace internal { + +// Forward declarations of the generic math functions +// implemented in GenericPacketMathFunctions.h +// This is needed to workaround a circular dependency. + +/*************************************************************************** + * Some generic implementations to be used by implementors +***************************************************************************/ + +/** Default implementation of pfrexp. + * It is expected to be called by implementers of template<> pfrexp. + */ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic(const Packet& a, Packet& exponent); + +// Extracts the biased exponent value from Packet p, and casts the results to +// a floating-point Packet type. Used by pfrexp_generic. Override this if +// there is no unpacket_traits::integer_packet. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pfrexp_generic_get_biased_exponent(const Packet& p); + +/** Default implementation of pldexp. + * It is expected to be called by implementers of template<> pldexp. 
+ */ +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +Packet pldexp_generic(const Packet& a, const Packet& exponent); + +/** \internal \returns log(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_float(const Packet _x); + +/** \internal \returns log2(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_float(const Packet _x); + +/** \internal \returns log(x) for double precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog_double(const Packet _x); + +/** \internal \returns log2(x) for double precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet plog2_double(const Packet _x); + +/** \internal \returns log(1 + x) */ +template +Packet generic_plog1p(const Packet& x); + +/** \internal \returns exp(x)-1 */ +template +Packet generic_expm1(const Packet& x); + +/** \internal \returns exp(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_float(const Packet _x); + +/** \internal \returns exp(x) for double precision real numbers */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pexp_double(const Packet _x); + +/** \internal \returns sin(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psin_float(const Packet& x); + +/** \internal \returns cos(x) for single precision float */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pcos_float(const Packet& x); + +/** \internal \returns sqrt(x) for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a); + +template struct ppolevl; + + +} // end namespace internal +} // end namespace Eigen + +#endif //
EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H diff --git a/externals/eigen/Eigen/src/Core/arch/Default/Half.h b/externals/eigen/Eigen/src/Core/arch/Default/Half.h new file mode 100644 index 00000000..9f8e8cc1 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/Default/Half.h @@ -0,0 +1,942 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. 
It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + + +#ifndef EIGEN_HALF_H +#define EIGEN_HALF_H + +#include + +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +// When compiling with GPU support, the "__half_raw" base class as well as +// some other routines are defined in the GPU compiler header files +// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr +// As a consequence, we get compile failures when compiling Eigen with +// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building +// Eigen with GPU support + #pragma push_macro("EIGEN_CONSTEXPR") + #undef EIGEN_CONSTEXPR + #define EIGEN_CONSTEXPR +#endif + +#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \ + template <> \ + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED \ + PACKET_F16 METHOD(const PACKET_F16& _x) { \ + return float2half(METHOD(half2float(_x))); \ + } + +namespace Eigen { + +struct half; + +namespace half_impl { + +// We want to use the __half_raw struct from the HIP header file only during the device compile phase. +// This is required because of a quirk in the way TensorFlow GPU builds are done. +// When compiling TensorFlow source code with GPU support, files that +// * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc +// * do not contain GPU kernels ( i.e. 
*.cc files) are compiled via gcc (typically) +// +// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that +// * are defined in a file that gets compiled via hipcc AND +// * have Eigen::half as a pass-by-value argument AND +// * are called in a file that gets compiled via gcc +// +// In the scenario described above the caller and callee will see different versions +// of the Eigen::half base class __half_raw, and they will be compiled by different compilers +// +// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in +// the callee getting corrupted values for the Eigen::half argument. +// +// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves +// this error, and hence the following convoluted #if condition +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { +#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)) + // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF) + // The element type for shared memory cannot have non-trivial constructors + // and hence the following special casing (which skips the zero-initilization). 
+ // Note that this check gets done even in the host compilation phase, and + // hence the need for this + EIGEN_DEVICE_FUNC __half_raw() {} +#else + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {} +#endif +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) { + } + __fp16 x; +#else + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {} + numext::uint16_t x; +#endif +}; + +#elif defined(EIGEN_HAS_HIP_FP16) + // Nothing to do here + // HIP fp16 header file has a definition for __half_raw +#elif defined(EIGEN_HAS_CUDA_FP16) + #if EIGEN_CUDA_SDK_VER < 90000 + // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw + typedef __half __half_raw; + #endif // defined(EIGEN_HAS_CUDA_FP16) +#elif defined(SYCL_DEVICE_ONLY) + typedef cl::sycl::half __half_raw; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base() {} + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); } + #elif defined(EIGEN_HAS_CUDA_FP16) + #if EIGEN_CUDA_SDK_VER >= 90000 + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif + #endif +#endif +}; + +} // namespace half_impl + +// Class definition. 
+struct half : public half_impl::half_base { + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE) + // Use the same base class for the following two scenarios + // * when compiling without GPU support enabled + // * during host compile phase when compiling with GPU support enabled + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + // Nothing to do here + // HIP fp16 header file has a definition for __half_raw +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so + // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within + // #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {} + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif +#endif + + + explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b) + : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + template + explicit EIGEN_DEVICE_FUNC half(T val) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(val))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} + + // Following the convention of numpy, converting between complex and + // float will lead to loss of imag value. 
+ template + explicit EIGEN_DEVICE_FUNC half(std::complex c) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast(c.real()))) {} + + EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless. + return half_impl::half_to_float(*this); + } + +#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE) + EIGEN_DEVICE_FUNC operator __half() const { + ::__half_raw hr; + hr.x = x; + return __half(hr); + } +#endif +}; + +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return 
Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + +namespace half_impl { + +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) +// Note: We deliberatly do *not* define this to 1 even if we have Arm's native +// fp16 type since GPU halfs are rather different from native CPU halfs. +// TODO: Rename to something like EIGEN_HAS_NATIVE_GPU_FP16 +#define EIGEN_HAS_NATIVE_FP16 +#endif + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. 
+ +#if defined(EIGEN_HAS_NATIVE_FP16) +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + return __hadd(::__half(a), ::__half(b)); +#else + return __hadd(a, b); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 + return __hdiv(a, b); +#else + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} +#endif + +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(vaddh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(vmulh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(vsubh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(vdivh_f16(a.x, b.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + return half(vnegh_f16(a.x)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(vaddh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(vmulh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(vsubh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(vdivh_f16(a.x, b.x)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return vceqh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return !vceqh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return vclth_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return vcleh_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return vcgth_f16(a.x, b.x); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return vcgeh_f16(a.x, b.x); +} +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). 
The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats + +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) +// We need to provide emulated *host-side* FP16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif + +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion +// to/from fp32. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const 
half& b) { + return numext::equal_strict(float(a),float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return numext::not_equal_strict(float(a), float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#if defined(__clang__) && defined(__CUDA__) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return half(static_cast(a) / static_cast(b)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) { + a += half(1); + return a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) { + a -= half(1); + return a; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) { + half original_value = a; + ++a; + return original_value; +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) { + half original_value = a; + --a; + return original_value; +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. 
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) { + // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type + // in the hip_fp16 header file, and that will trigger a compile error + // On the other hand, having anything but a return statement also triggers a compile error + // because this is constexpr function. + // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out + // of this catch22 by having separate bodies for GPU / non GPU +#if defined(EIGEN_HAS_GPU_FP16) + __half_raw h; + h.x = x; + return h; +#else + return __half_raw(x); +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) { + // HIP/CUDA/Default have a member 'x' of type uint16_t. + // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast. + // For SYCL, cl::sycl::half is _Float16, so cast directly. +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return numext::bit_cast(h.x); +#elif defined(SYCL_DEVICE_ONLY) + return numext::bit_cast(h); +#else + return h.x; +#endif +} + +union float32_bits { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + __half_raw h; + h.x = static_cast<__fp16>(ff); + return h; + +#else + float32_bits f; f.f = ff; + + const float32_bits f32infty = { 255 << 23 }; + const float32_bits f16max = { (127 + 16) << 23 }; + const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + 
__half_raw o; + o.x = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! 
+ o.x = static_cast(f.u >> 13); + } + } + + o.x |= static_cast(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(h); +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return static_cast(h.x); +#else + const float32_bits magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + float32_bits o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { +#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + return (numext::bit_cast(a.x) & 0x7fff) == 0x7c00; +#else + return (a.x & 0x7fff) == 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hisnan(a); +#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return (numext::bit_cast(a.x) & 0x7fff) > 0x7c00; +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half 
abs(const half& a) { +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + return half(vabsh_f16(a.x)); +#else + half result; + result.x = a.x & 0x7FFF; + return result; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return half(::hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) { + return half(static_cast(EIGEN_LOG2E) * ::logf(float(a))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { + return half(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { + return half(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { + return half(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const 
half& a) { + return half(::tanhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { + return half(::asinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { + return half(::acosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hfloor(a)); +#else + return half(::floorf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + return half(hceil(a)); +#else + return half(::ceilf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { + return half(::rintf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { + return half(::roundf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) { + return half(::fmodf(float(a), float(b))); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f2 < f1 ? b : a; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast(a); + const float f2 = static_cast(b); + return f1 < f2 ? 
b : a; +#endif +} + +#ifndef EIGEN_NO_IO +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { + os << static_cast(v); + return os; +} +#endif + +} // end namespace half_impl + +// import Eigen::half_impl::half into Eigen namespace +// using half_impl::half; + +namespace internal { + +template<> +struct random_default_impl +{ + static inline half run(const half& x, const half& y) + { + return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); + } + static inline half run() + { + return run(half(-1.f), half(1.f)); + } +}; + +template<> struct is_arithmetic { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits + : GenericNumTraits +{ + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return half_impl::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { + return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() { + return half_impl::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() { + return half_impl::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() { + return half_impl::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return half_impl::raw_uint16_to_half(0x7e00); + } +}; + +} // end namespace Eigen + +#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + #pragma pop_macro("EIGEN_CONSTEXPR") +#endif + +namespace Eigen { +namespace numext { + +#if defined(EIGEN_GPU_COMPILE_PHASE) + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
bool(isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +#endif + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast(const uint16_t& src) { + return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src)); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(const Eigen::half& src) { + return Eigen::half_impl::raw_half_as_uint16(src); +} + +} // namespace numext +} // namespace Eigen + +// Add the missing shfl* intrinsics. +// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. +// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) +// +// HIP and CUDA prior to SDK 9.0 define +// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float +// CUDA since 9.0 deprecates those and instead defines +// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, +// with native support for __half and __nv_bfloat16 +// +// Note that the following are __device__ - only functions. 
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \ + || defined(EIGEN_HIPCC) + +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_sync(mask, h, srcLane, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_up_sync(mask, h, delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_down_sync(mask, h, delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) { + const __half h = var; + return static_cast(__shfl_xor_sync(mask, h, laneMask, width)); +} + +#else // HIP or CUDA SDK < 9.0 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl(ivar, srcLane, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_up(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_down(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + const int ivar = 
static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_xor(ivar, laneMask, width))); +} + +#endif // HIP vs CUDA +#endif // __shfl* + +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. +#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \ + || defined(EIGEN_HIPCC) +EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast(ptr))); +} +#endif // __ldg + +#if EIGEN_HAS_STD_HASH +namespace std { +template <> +struct hash { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast(Eigen::numext::bit_cast(a)); + } +}; +} // end namespace std +#endif + +#endif // EIGEN_HALF_H diff --git a/externals/eigen/Eigen/src/Core/arch/Default/Settings.h b/externals/eigen/Eigen/src/Core/arch/Default/Settings.h index 097373c8..a5c3ada4 100644 --- a/externals/eigen/Eigen/src/Core/arch/Default/Settings.h +++ b/externals/eigen/Eigen/src/Core/arch/Default/Settings.h @@ -21,7 +21,7 @@ * it does not correspond to the number of iterations or the number of instructions */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 100 +#define EIGEN_UNROLLING_LIMIT 110 #endif /** Defines the threshold between a "small" and a "large" matrix. diff --git a/externals/eigen/Eigen/src/Core/arch/Default/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/Default/TypeCasting.h new file mode 100644 index 00000000..fb8183b7 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/Default/TypeCasting.h @@ -0,0 +1,120 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2019 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GENERIC_TYPE_CASTING_H +#define EIGEN_GENERIC_TYPE_CASTING_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef 
Eigen::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const float& a) const { + return Eigen::bfloat16(a); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 operator() (const int& a) const { + return Eigen::bfloat16(static_cast(a)); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::bfloat16& a) const { + return static_cast(a); + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +} +} + +#endif // EIGEN_GENERIC_TYPE_CASTING_H diff --git a/externals/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h similarity index 82% rename from externals/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h rename to externals/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h index 0348b41d..d2b3a256 100644 --- a/externals/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/GPU/MathFunctions.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H -#define EIGEN_MATH_FUNCTIONS_CUDA_H +#ifndef EIGEN_MATH_FUNCTIONS_GPU_H +#define EIGEN_MATH_FUNCTIONS_GPU_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog(const float4& a) { @@ -56,6 +56,18 @@ double2 pexp(const double2& a) return make_double2(exp(a.x), exp(a.y)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pexpm1(const float4& a) +{ + return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pexpm1(const double2& a) +{ + return make_double2(expm1(a.x), expm1(a.y)); +} + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psqrt(const float4& a) { @@ -88,4 +100,4 @@ double2 prsqrt(const double2& a) } // end namespace Eigen -#endif // EIGEN_MATH_FUNCTIONS_CUDA_H +#endif // EIGEN_MATH_FUNCTIONS_GPU_H diff --git a/externals/eigen/Eigen/src/Core/arch/GPU/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/GPU/PacketMath.h new file mode 100644 index 00000000..689110de --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/GPU/PacketMath.h @@ -0,0 +1,1685 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_GPU_H +#define EIGEN_PACKET_MATH_GPU_H + +namespace Eigen { + +namespace internal { + +// Read-only data cached load available. 
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) +#define EIGEN_GPU_HAS_LDG 1 +#endif + +// FP16 math available. +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) +#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1 +#endif + +#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) +#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 +#endif + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) + +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasNdtri = 1, + HasBessel = 1, + HasIGamma = 1, + HasIGammaDerA = 1, + HasGammaSampleDerAlpha = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + HasFloor = 1, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasNdtri = 1, + HasBessel = 1, + HasIGamma = 1, + HasIGammaDerA = 1, + HasGammaSampleDerAlpha = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + HasFloor = 1, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, 
vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + +// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, +// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation +// of the functions, while the latter can only deal with one of them. +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) | __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) | + __double_as_longlong(b)); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) ^ + __double_as_longlong(b)); +} + 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, + const float& b) { + return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, + const double& b) { + return __longlong_as_double(__double_as_longlong(a) & + ~__double_as_longlong(b)); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, + const float& b) { + return __int_as_float(a == b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, + const double& b) { + return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, + const float& b) { + return __int_as_float(a < b ? 0xffffffffu : 0u); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, + const double& b) { + return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); +} + +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, + const float4& b) { + return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), + bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand(const double2& a, + const double2& b) { + return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por(const float4& a, + const float4& b) { + return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), + bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por(const double2& a, + const double2& b) { + return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor(const float4& a, + const float4& b) { + return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), + bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); 
+} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor(const double2& a, + const double2& b) { + return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot(const float4& a, + const float4& b) { + return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), + bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pandnot(const double2& a, const double2& b) { + return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq(const float4& a, + const float4& b) { + return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), + eq_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt(const float4& a, + const float4& b) { + return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), + lt_mask(a.w, b.w)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_eq(const double2& a, const double2& b) { + return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pcmp_lt(const double2& a, const double2& b) { + return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y)); +} +#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + 
return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> 
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return make_double2(__ldg(from+0), __ldg(from+1)); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + 
return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { + return a.x * a.y * a.z * a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { + return a.x * a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(fabs(a.x), fabs(a.y)); +} + +template<> EIGEN_DEVICE_FUNC inline float4 pfloor(const float4& a) { + return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pfloor(const double2& a) { + return make_double2(floor(a.x), floor(a.y)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) + +// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning +// its corresponding packet_traits must be visible on host. 
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + +typedef ulonglong2 Packet4h2; +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4h2 type; + typedef Packet4h2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +namespace { +// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __halves2half2(a, b); +#else + // Round-about way since __halves2half2 is a __device__ function. 
+ return __floats2half2_rn(__half2float(a), __half2float(b)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __low2half(a); +#else + return __float2half(__low2float(a)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __high2half(a); +#else + return __float2half(__high2float(a)); +#endif +} +} // namespace + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __half2half2(from); +#else + const float f = __half2float(from); + return __floats2half2_rn(f, f); +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pset1(const Eigen::half& from) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = pset1(from); + p_alias[1] = pset1(from); + p_alias[2] = pset1(from); + p_alias[3] = pset1(from); + return r; +} + +// We now need this visible on both host and device. 
+// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) +namespace { + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { + return *reinterpret_cast(from); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { + return combine_half(from[0], from[1]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { + return combine_half(from[0], from[0]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, + const half2& from) { + *reinterpret_cast(to) = from; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, + const half2& from) { + to[0] = get_half2_low(from); + to[1] = get_half2_high(from); +} + + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( + const Eigen::half* from) { +#if defined(EIGEN_GPU_HAS_LDG) + // Input is guaranteed to be properly aligned. + return __ldg(reinterpret_cast(from)); +#else + return combine_half(*(from+0), *(from+1)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( + const Eigen::half* from) { +#if defined(EIGEN_GPU_HAS_LDG) + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return combine_half(*(from+0), *(from+1)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, + Index stride) { + return combine_half(from[0*stride], from[1*stride]); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( + Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = get_half2_low(from); + to[stride*1] = get_half2_high(from); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return get_half2_low(a); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); + half result2 = 
half_impl::raw_uint16_to_half(a2.x & 0x7FFF); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = get_half2_low(kernel.packet[0]); + __half a2 = get_half2_high(kernel.packet[0]); + __half b1 = get_half2_low(kernel.packet[1]); + __half b2 = get_half2_high(kernel.packet[1]); + kernel.packet[0] = combine_half(a1, b1); + kernel.packet[1] = combine_half(a2, b2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return combine_half(a, __float2half(f)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, + const half2& a, + const half2& b) { + half mask_low = get_half2_low(mask); + half mask_high = get_half2_high(mask); + half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); + half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); + return combine_half(result_low, result_high); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) == __half2float(b2) ? 
true_half : false_half; + return combine_half(eq1, eq2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, + const half2& b) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + half false_half = half_impl::raw_uint16_to_half(0x0000u); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; + half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; + return combine_half(eq1, eq2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); + half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, + const half2& b) { + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); + half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); + half result2 = 
half_impl::raw_uint16_to_half(a2.x & ~b2.x); + return combine_half(result1, result2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, + const half2& b, + const half2& c) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hfma2(a, b, c); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c1; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); 
+#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __h2div(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 + a2)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 > a2 ? 
get_half2_low(a) : get_half2_high(a); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 < a2 ? get_half2_low(a) : get_half2_high(a); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half(a1 * a2)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#else + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = logf(a1); + float r2 = logf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { + float a1 = __low2float(a); + float a2 = 
__high2float(a); + float r1 = expf(a1); + float r2 = expf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = sqrtf(a1); + float r2 = sqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = rsqrtf(a1); + float r2 = rsqrtf(a2); + return __floats2half2_rn(r1, r2); +} +#endif +} // namespace + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pload(const Eigen::half* from) { + return *reinterpret_cast(from); +} + +// unaligned load; +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +ploadu(const Eigen::half* from) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = ploadu(from + 0); + p_alias[1] = ploadu(from + 2); + p_alias[2] = ploadu(from + 4); + p_alias[3] = ploadu(from + 6); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +ploaddup(const Eigen::half* from) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = ploaddup(from + 0); + p_alias[1] = ploaddup(from + 1); + p_alias[2] = ploaddup(from + 2); + p_alias[3] = ploaddup(from + 3); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore( + Eigen::half* to, const Packet4h2& from) { + *reinterpret_cast(to) = from; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu( + Eigen::half* to, const Packet4h2& from) { + const half2* from_alias = reinterpret_cast(&from); + pstoreu(to + 0,from_alias[0]); + pstoreu(to + 2,from_alias[1]); + pstoreu(to + 4,from_alias[2]); + pstoreu(to + 6,from_alias[3]); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 +ploadt_ro(const Eigen::half* from) { +#if defined(EIGEN_GPU_HAS_LDG) + Packet4h2 r; + r = __ldg(reinterpret_cast(from)); + return r; +#else + Packet4h2 r; + 
half2* r_alias = reinterpret_cast(&r); + r_alias[0] = ploadt_ro_aligned(from + 0); + r_alias[1] = ploadt_ro_aligned(from + 2); + r_alias[2] = ploadt_ro_aligned(from + 4); + r_alias[3] = ploadt_ro_aligned(from + 6); + return r; +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 +ploadt_ro(const Eigen::half* from) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + r_alias[0] = ploadt_ro_unaligned(from + 0); + r_alias[1] = ploadt_ro_unaligned(from + 2); + r_alias[2] = ploadt_ro_unaligned(from + 4); + r_alias[3] = ploadt_ro_unaligned(from + 6); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pgather(const Eigen::half* from, Index stride) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); + p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); + p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); + p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( + Eigen::half* to, const Packet4h2& from, Index stride) { + const half2* from_alias = reinterpret_cast(&from); + pscatter(to + stride * 0, from_alias[0], stride); + pscatter(to + stride * 2, from_alias[1], stride); + pscatter(to + stride * 4, from_alias[2], stride); + pscatter(to + stride * 6, from_alias[3], stride); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst( + const Packet4h2& a) { + return pfirst(*(reinterpret_cast(&a))); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs( + const Packet4h2& a) { + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + p_alias[0] = pabs(a_alias[0]); + p_alias[1] = pabs(a_alias[1]); + p_alias[2] = pabs(a_alias[2]); + p_alias[3] = pabs(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue( + const 
Packet4h2& /*a*/) { + half true_half = half_impl::raw_uint16_to_half(0xffffu); + return pset1(true_half); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero(const Packet4h2& /*a*/) { + half false_half = half_impl::raw_uint16_to_half(0x0000u); + return pset1(false_half); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double( + double* d_row0, double* d_row1, double* d_row2, double* d_row3, + double* d_row4, double* d_row5, double* d_row6, double* d_row7) { + double d_tmp; + d_tmp = d_row0[1]; + d_row0[1] = d_row4[0]; + d_row4[0] = d_tmp; + + d_tmp = d_row1[1]; + d_row1[1] = d_row5[0]; + d_row5[0] = d_tmp; + + d_tmp = d_row2[1]; + d_row2[1] = d_row6[0]; + d_row6[0] = d_tmp; + + d_tmp = d_row3[1]; + d_row3[1] = d_row7[0]; + d_row7[0] = d_tmp; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( + half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) { + half2 f_tmp; + f_tmp = f_row0[1]; + f_row0[1] = f_row2[0]; + f_row2[0] = f_tmp; + + f_tmp = f_row1[1]; + f_row1[1] = f_row3[0]; + f_row3[0] = f_tmp; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose_half(half2& f0, half2& f1) { + __half a1 = get_half2_low(f0); + __half a2 = get_half2_high(f0); + __half b1 = get_half2_low(f1); + __half b2 = get_half2_high(f1); + f0 = combine_half(a1, b1); + f1 = combine_half(a2, b2); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + double* d_row0 = reinterpret_cast(&kernel.packet[0]); + double* d_row1 = reinterpret_cast(&kernel.packet[1]); + double* d_row2 = reinterpret_cast(&kernel.packet[2]); + double* d_row3 = reinterpret_cast(&kernel.packet[3]); + double* d_row4 = reinterpret_cast(&kernel.packet[4]); + double* d_row5 = reinterpret_cast(&kernel.packet[5]); + double* d_row6 = reinterpret_cast(&kernel.packet[6]); + double* d_row7 = reinterpret_cast(&kernel.packet[7]); + ptranspose_double(d_row0, d_row1, d_row2, d_row3, + d_row4, d_row5, d_row6, d_row7); + + + half2* f_row0 = 
reinterpret_cast(d_row0); + half2* f_row1 = reinterpret_cast(d_row1); + half2* f_row2 = reinterpret_cast(d_row2); + half2* f_row3 = reinterpret_cast(d_row3); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + + f_row0 = reinterpret_cast(d_row0 + 1); + f_row1 = reinterpret_cast(d_row1 + 1); + f_row2 = reinterpret_cast(d_row2 + 1); + f_row3 = reinterpret_cast(d_row3 + 1); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + + f_row0 = reinterpret_cast(d_row4); + f_row1 = reinterpret_cast(d_row5); + f_row2 = reinterpret_cast(d_row6); + f_row3 = reinterpret_cast(d_row7); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + + f_row0 = reinterpret_cast(d_row4 + 1); + f_row1 = reinterpret_cast(d_row5 + 1); + f_row2 = reinterpret_cast(d_row6 + 1); + f_row3 = reinterpret_cast(d_row7 + 1); + ptranspose_half2(f_row0, f_row1, f_row2, f_row3); + ptranspose_half(f_row0[0], f_row1[0]); + ptranspose_half(f_row0[1], f_row1[1]); + ptranspose_half(f_row2[0], f_row3[0]); + ptranspose_half(f_row2[1], f_row3[1]); + +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f))); + p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), + __hadd(a, __float2half(3.0f))); + p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), + __hadd(a, __float2half(5.0f))); + p_alias[3] = __halves2half2(__hadd(a, 
__float2half(6.0f)), + __hadd(a, __float2half(7.0f))); + return r; +#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + + half2 b = pset1(a); + half2 c; + half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f)); + half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f)); + + c = __hadd2(b, half_offset0); + r_alias[0] = plset(__low2half(c)); + r_alias[1] = plset(__high2half(c)); + + c = __hadd2(b, half_offset1); + r_alias[2] = plset(__low2half(c)); + r_alias[3] = plset(__high2half(c)); + + return r; + +#else + float f = __half2float(a); + Packet4h2 r; + half2* p_alias = reinterpret_cast(&r); + p_alias[0] = combine_half(a, __float2half(f + 1.0f)); + p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + return r; +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pselect(const Packet4h2& mask, const Packet4h2& a, + const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* mask_alias = reinterpret_cast(&mask); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]); + r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]); + r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]); + r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pcmp_eq(const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]); + r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]); + r_alias[2] = 
pcmp_eq(a_alias[2], b_alias[2]); + r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pand(a_alias[0], b_alias[0]); + r_alias[1] = pand(a_alias[1], b_alias[1]); + r_alias[2] = pand(a_alias[2], b_alias[2]); + r_alias[3] = pand(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = por(a_alias[0], b_alias[0]); + r_alias[1] = por(a_alias[1], b_alias[1]); + r_alias[2] = por(a_alias[2], b_alias[2]); + r_alias[3] = por(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pxor(a_alias[0], b_alias[0]); + r_alias[1] = pxor(a_alias[1], b_alias[1]); + r_alias[2] = pxor(a_alias[2], b_alias[2]); + r_alias[3] = pxor(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pandnot(const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pandnot(a_alias[0], b_alias[0]); + r_alias[1] = pandnot(a_alias[1], b_alias[1]); + r_alias[2] = pandnot(a_alias[2], b_alias[2]); + r_alias[3] = pandnot(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
Packet4h2 padd( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = padd(a_alias[0], b_alias[0]); + r_alias[1] = padd(a_alias[1], b_alias[1]); + r_alias[2] = padd(a_alias[2], b_alias[2]); + r_alias[3] = padd(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = psub(a_alias[0], b_alias[0]); + r_alias[1] = psub(a_alias[1], b_alias[1]); + r_alias[2] = psub(a_alias[2], b_alias[2]); + r_alias[3] = psub(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = pnegate(a_alias[0]); + r_alias[1] = pnegate(a_alias[1]); + r_alias[2] = pnegate(a_alias[2]); + r_alias[3] = pnegate(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) { + return a; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pmul(a_alias[0], b_alias[0]); + r_alias[1] = pmul(a_alias[1], b_alias[1]); + r_alias[2] = pmul(a_alias[2], b_alias[2]); + r_alias[3] = pmul(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd( + const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias 
= reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + const half2* c_alias = reinterpret_cast(&c); + r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]); + r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]); + r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]); + r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pdiv(a_alias[0], b_alias[0]); + r_alias[1] = pdiv(a_alias[1], b_alias[1]); + r_alias[2] = pdiv(a_alias[2], b_alias[2]); + r_alias[3] = pdiv(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pmin(a_alias[0], b_alias[0]); + r_alias[1] = pmin(a_alias[1], b_alias[1]); + r_alias[2] = pmin(a_alias[2], b_alias[2]); + r_alias[3] = pmin(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax( + const Packet4h2& a, const Packet4h2& b) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + const half2* b_alias = reinterpret_cast(&b); + r_alias[0] = pmax(a_alias[0], b_alias[0]); + r_alias[1] = pmax(a_alias[1], b_alias[1]); + r_alias[2] = pmax(a_alias[2], b_alias[2]); + r_alias[3] = pmax(a_alias[3], b_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + + return predux(a_alias[0]) + predux(a_alias[1]) + + predux(a_alias[2]) + predux(a_alias[3]); +} + +template 
<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + half2 m0 = combine_half(predux_max(a_alias[0]), + predux_max(a_alias[1])); + half2 m1 = combine_half(predux_max(a_alias[2]), + predux_max(a_alias[3])); + __half first = predux_max(m0); + __half second = predux_max(m1); +#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) + return (__hgt(first, second) ? first : second); +#else + float ffirst = __half2float(first); + float fsecond = __half2float(second); + return (ffirst > fsecond)? first: second; +#endif +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + half2 m0 = combine_half(predux_min(a_alias[0]), + predux_min(a_alias[1])); + half2 m1 = combine_half(predux_min(a_alias[2]), + predux_min(a_alias[3])); + __half first = predux_min(m0); + __half second = predux_min(m1); +#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC) + return (__hlt(first, second) ? first : second); +#else + float ffirst = __half2float(first); + float fsecond = __half2float(second); + return (ffirst < fsecond)? 
first: second; +#endif +} + +// likely overflow/underflow +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul( + const Packet4h2& a) { + const half2* a_alias = reinterpret_cast(&a); + return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), + pmul(a_alias[2], a_alias[3]))); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +plog1p(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = plog1p(a_alias[0]); + r_alias[1] = plog1p(a_alias[1]); + r_alias[2] = plog1p(a_alias[2]); + r_alias[3] = plog1p(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +pexpm1(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = pexpm1(a_alias[0]); + r_alias[1] = pexpm1(a_alias[1]); + r_alias[2] = pexpm1(a_alias[2]); + r_alias[3] = pexpm1(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = plog(a_alias[0]); + r_alias[1] = plog(a_alias[1]); + r_alias[2] = plog(a_alias[2]); + r_alias[3] = plog(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = pexp(a_alias[0]); + r_alias[1] = pexp(a_alias[1]); + r_alias[2] = pexp(a_alias[2]); + r_alias[3] = pexp(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = psqrt(a_alias[0]); + r_alias[1] = psqrt(a_alias[1]); + r_alias[2] = psqrt(a_alias[2]); + r_alias[3] = 
psqrt(a_alias[3]); + return r; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 +prsqrt(const Packet4h2& a) { + Packet4h2 r; + half2* r_alias = reinterpret_cast(&r); + const half2* a_alias = reinterpret_cast(&a); + r_alias[0] = prsqrt(a_alias[0]); + r_alias[1] = prsqrt(a_alias[1]); + r_alias[2] = prsqrt(a_alias[2]); + r_alias[3] = prsqrt(a_alias[3]); + return r; +} + +// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for +// the implementation of GPU half reduction. +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, + const half2& b) { +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) + return __h2div(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 < b2 ? 
get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, + const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); +} + +// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) + +#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + +#undef EIGEN_GPU_HAS_LDG +#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC +#undef EIGEN_GPU_HAS_FP16_ARITHMETIC + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_GPU_H diff --git a/externals/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h new file mode 100644 index 00000000..75454622 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -0,0 +1,80 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_GPU_H +#define EIGEN_TYPE_CASTING_GPU_H + +namespace Eigen { + +namespace internal { + +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast(const float4& a, const float4& b) { + Packet4h2 r; + half2* r_alias=reinterpret_cast(&r); + r_alias[0]=__floats2half2_rn(a.x,a.y); + r_alias[1]=__floats2half2_rn(a.z,a.w); + r_alias[2]=__floats2half2_rn(b.x,b.y); + r_alias[3]=__floats2half2_rn(b.z,b.w); + return r; +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const Packet4h2& a) { + // Simply discard the second half of the input + float4 r; + const half2* a_alias=reinterpret_cast(&a); + float2 r1 = __half22float2(a_alias[0]); + float2 r2 = __half22float2(a_alias[1]); + r.x=static_cast(r1.x); + r.y=static_cast(r1.y); + r.z=static_cast(r2.x); + r.w=static_cast(r2.y); + return r; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/externals/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/externals/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 00000000..25375a0a --- /dev/null +++ 
b/externals/eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/externals/eigen/Eigen/src/Core/arch/MSA/Complex.h b/externals/eigen/Eigen/src/Core/arch/MSA/Complex.h new file mode 100644 index 00000000..53dacfa4 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/MSA/Complex.h @@ -0,0 +1,648 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_COMPLEX_MSA_H +#define EIGEN_COMPLEX_MSA_H + +#include + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet2cf { + EIGEN_STRONG_INLINE Packet2cf() { + } + EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex& a, + const std::complex& b) { + Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) }; + v = t; + } + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf conjugate(void) const { + return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63)); + } + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + Packet4f v1, v2; + + // Get the real values of a | a1_re | a1_re | a2_re | a2_re | + v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v); + // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | + v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet2cf(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { + *this *= b.conjugate(); + Packet4f s = pmul(b.v, b.v); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { + return Packet2cf(*this) /= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(pnegate(v)); + } + + Packet4f v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] + << "i)," + " (" + << value.v[2] << ", " << value.v[3] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet2cf half; +}; + +template <> +EIGEN_STRONG_INLINE 
Packet2cf pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + float f0 = from.real(), f1 = from.imag(); + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} 
+ +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { + EIGEN_MSA_DEBUG; + + return Packet2cf(from[0 * stride], from[1 * stride]); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, + const Packet2cf& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = std::complex(from.v[0], from.v[1]); + to += stride; + *to = std::complex(from.v[2], from.v[3]); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + Packet4f value = (Packet4f)preverse((Packet2d)a.v); + value += a.v; + return std::complex(value[0], value[1]); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), + (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) + +template <> +EIGEN_STRONG_INLINE 
Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet4f tmp = + (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[0].v = + (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[1].v = tmp; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, + const Packet2cf& elsePacket) { + return (Packet2cf)(Packet4f)pblend(ifPacket, (Packet2d)thenPacket.v, + (Packet2d)elsePacket.v); +} + +//---------- double ---------- + +struct Packet1cd { + EIGEN_STRONG_INLINE Packet1cd() { + } + EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex& a) { + v[0] = std::real(a); + v[1] = std::imag(a); + } + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd conjugate(void) const { + static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 }; + return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR); + } + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + Packet2d v1, v2; + + // Get the real values of a | a1_re | a1_re + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v); + // Get the imag values of a | a1_im | a1_im + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet1cd(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) { + *this *= b.conjugate(); + Packet2d s = pmul(b.v, b.v); + s = padd(s, preverse(s)); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { + return Packet1cd(*this) /= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(pnegate(v)); + } + + Packet2d v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet1cd half; +}; + +template <> +EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const 
double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + return Packet1cd(from); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet1cd& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet1cd& from) { + 
EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + Packet1cd res; + res.v[0] = std::real(from[0]); + res.v[1] = std::imag(from[0]); + return res; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, + const Packet1cd& from, + Index stride + __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + pstore(to, from); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) + +template <> +EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip /**/ (const Packet1cd& x) { + EIGEN_MSA_DEBUG; + + return Packet1cd(preverse(Packet2d(x.v))); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d v1, v2; + + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v); + // Get the imag values of a + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, 
(v2i64)kernel.packet[1].v); + + kernel.packet[0].v = v1; + kernel.packet[1].v = v2; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_MSA_H diff --git a/externals/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h new file mode 100644 index 00000000..f5181b90 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/MSA/MathFunctions.h @@ -0,0 +1,387 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// Copyright (C) 2016 Gael Guennebaud +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +/* The tanh function of this file is an adaptation of + * template T generic_fast_tanh_float(const T&) + * from MathFunctionsImpl.h. 
+ */ + +#ifndef EIGEN_MATH_FUNCTIONS_MSA_H +#define EIGEN_MATH_FUNCTIONS_MSA_H + +namespace Eigen { + +namespace internal { + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +plog(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + // Convert negative argument into NAN (quiet negative, to be specific). + Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0); + Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero); + Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero); + Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN. + Packet4f x = non_neg_x_or_nan; + + // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0. + // N.B. the exponent is one less of what frexpf() would return. + Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x)); + // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf(). 
+ x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0)); + + /* + if (x < SQRTHF) { + x = x + x - 1.0; + } else { + e += 1; + x = x - 1.0; + } + */ + Packet4f xx = padd(x, x); + Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x); + e_int = psub(e_int, ge_mask); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x); + x = psub(x, p4f_1); + Packet4f e = __builtin_msa_ffint_s_w(e_int); + + Packet4f x2 = pmul(x, x); + Packet4f x3 = pmul(x2, x); + + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y, x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y = pmadd(e, p4f_cephes_log_q1, y); + x = __builtin_msa_fmsub_w(x, x2, p4f_half); + x = padd(x, y); + x = pmadd(e, p4f_cephes_log_q2, x); + + // x is now the logarithm result candidate. We still need to handle the + // extreme arguments of zero and positive infinity, though. + // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms + // contain infinities of both signs (see the coefficients and code above). + // INFINITY - INFINITY is NAN. + + // If the argument is +INFINITY, make it the new result candidate. + // To achieve that we choose the smaller of the result candidate and the + // argument. + // This is correct for all finite pairs of values (the logarithm is smaller + // than the argument). + // This is also correct in the special case when the argument is +INFINITY + // and the result candidate is NAN. This is because the fmin.df instruction + // prefers non-NANs to NANs. + x = __builtin_msa_fmin_w(x, non_neg_x_or_nan); + + // If the argument is zero (including -0.0), the result becomes -INFINITY. 
+ Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs); + + return x; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pexp(const Packet4f& _x) { + // Limiting single-precision pexp's argument to [-128, +128] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = _x; + + // Clamp x. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, + (v16u8)p4f_exp_lo); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, + (v16u8)p4f_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. 
+ Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0); + Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add); + Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2); + Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int); + + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1); + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2); + + Packet4f z = pmul(x, x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // y *= 2**exponent. + y = __builtin_msa_fexp2_w(y, x2_int); + + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); + // The monomial coefficients of the numerator polynomial (odd). + static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); + // The monomial coefficients of the denominator polynomial (even). 
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); + + Packet4f x = pabs(_x); + Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny); + + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, + (v16u8)p4f_tanh_hi); + + // Since the polynomials are odd/even, we need x**2. + Packet4f x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11); + p = pmadd(x2, p, p4f_alpha_9); + p = pmadd(x2, p, p4f_alpha_7); + p = pmadd(x2, p, p4f_alpha_5); + p = pmadd(x2, p, p4f_alpha_3); + p = pmadd(x2, p, p4f_alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4); + q = pmadd(x2, q, p4f_beta_2); + q = pmadd(x2, q, p4f_beta_0); + + // Divide the numerator by the denominator. + p = pdiv(p, q); + + // Reinstate the sign. + p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0); + + // When the argument is very small in magnitude it's more accurate to just return it. + p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x); + + return p; +} + +template +Packet4f psincos_inner_msa_float(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). 
+ static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = pabs(_x); + + // Translate infinite arguments into NANs. + Packet4f zero_or_nan_if_inf = psub(_x, _x); + x = padd(x, zero_or_nan_if_inf); + // Prevent sin/cos from generating values larger than 1.0 in magnitude + // for very large arguments by setting x to 0.0. + Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg); + x = pand(x, (Packet4f)small_or_nan_mask); + + // Scale x by 4/Pi to find x's octant. + Packet4f y = pmul(x, p4f_cephes_FOPI); + // Get the octant. We'll reduce x by this number of octants or by one more than it. + Packet4i y_int = __builtin_msa_ftrunc_s_w(y); + // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. + // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. + // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). + Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1); + Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear + y = __builtin_msa_ffint_s_w(y_int2); + + // Compute the sign to apply to the polynomial. + Packet4i sign_mask = sine ? 
pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x) + : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29); + + // Get the polynomial selection mask. + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0); + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. + // The magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1); + Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2); + Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, tmp1); + x = padd(x, tmp2); + x = padd(x, tmp3); + + // Evaluate the cos(x) polynomial. + y = p4f_coscof_p0; + Packet4f z = pmul(x, x); + y = pmadd(y, z, p4f_coscof_p1); + y = pmadd(y, z, p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + y = __builtin_msa_fmsub_w(y, z, p4f_half); + y = padd(y, p4f_1); + + // Evaluate the sin(x) polynomial. + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2) + : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y); + + // Update the sign. 
+ sign_mask = pxor(sign_mask, (Packet4i)y); + y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +psin(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pcos(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d +pexp(const Packet2d& _x) { + // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0); + static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0); + + Packet2d x = _x; + + // Clamp x. 
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, + (v16u8)p2d_exp_lo); + x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, + (v16u8)p2d_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. + Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0); + Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add); + Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2); + Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long); + + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1); + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2); + + x2 = pmul(x, x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul(px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px, psub(qx, px)); + x = pmadd(p2d_2, x, p2d_1); + + // x *= 2**exponent. + x = __builtin_msa_fexp2_d(x, x2_long); + + return x; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_MSA_H diff --git a/externals/eigen/Eigen/src/Core/arch/MSA/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/MSA/PacketMath.h new file mode 100644 index 00000000..afe8f337 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/MSA/PacketMath.h @@ -0,0 +1,1233 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_PACKET_MATH_MSA_H +#define EIGEN_PACKET_MATH_MSA_H + +#include +#include + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +#if 0 +#define EIGEN_MSA_DEBUG \ + static bool firstTime = true; \ + do { \ + if (firstTime) { \ + std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ + firstTime = false; \ + } \ + } while (0) +#else +#define EIGEN_MSA_DEBUG +#endif + +#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) + +typedef v4f32 Packet4f; +typedef v4i32 Packet4i; +typedef v4u32 Packet4ui; + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4f type; + typedef Packet4f half; // Packet2f intrinsics not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2f intrinsics not implemented 
yet + // FIXME check the Has* + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4i type; + typedef Packet4i half; // Packet2i intrinsics not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2i intrinsics not implemented yet + // FIXME check the Has* + HasDiv = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet4f half; +}; + +template <> +struct unpacket_traits { + typedef int32_t type; + enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet4i half; +}; + +template <> +EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + EIGEN_MSA_DEBUG; + + Packet4f v = { from, from, from, from }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload1(const float* from) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload1(const int32_t* from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(*from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f 
plset(const float& a) { + EIGEN_MSA_DEBUG; + + static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { + EIGEN_MSA_DEBUG; + + static const Packet4i countdown = { 0, 1, 2, 3 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_subv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_mulv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_div_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_w(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE 
Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + EIGEN_MSA_DEBUG; + + // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug. + Packet4i value = c; + __asm__("maddv.w %w[value], %w[a], %w[b]\n" + // Outputs + : [value] "+f"(value) + // Inputs + : [a] "f"(a), [b] "f"(b)); + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_w(a, b); +#else + // This prefers NaNs to numbers. 
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a); + Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN); + return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_min_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_w(a, b); +#else + // This prefers NaNs to numbers. + Packet4i aNaN = __builtin_msa_fcun_w(a, a); + Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN); + return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_max_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { + EIGEN_MSA_DEBUG; + + float f0 = from[0], f1 = from[1]; + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i ploaddup(const 
int32_t* from) { + EIGEN_MSA_DEBUG; + + int32_t i0 = from[0], i1 = from[1]; + Packet4i v0 = { i0, i0, i0, i0 }; + Packet4i v1 = { i1, i1, i1, i1 }; + return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { + EIGEN_MSA_DEBUG; + + int32_t i = *from; + Packet4i v = { i, i, i, i }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const float* addr) { + 
EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i zero = __builtin_msa_ldi_w(0); + return __builtin_msa_add_a_w(zero, a); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + + +template <> +EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + +// Other reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t 
predux_mul(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + // Swap 64-bit halves of a. + Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +#if !EIGEN_FAST_MATH + // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit + // masks of all zeroes/ones in low 64 bits. + v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); + // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. + unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); +#endif + // Continue with min computation. + Packet4f v = __builtin_msa_fmin_w(a, swapped); + v = __builtin_msa_fmin_w( + v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +#if !EIGEN_FAST_MATH + // Based on the mask select between v and 4 qNaNs. + v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +// max +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + // Swap 64-bit halves of a. + Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +#if !EIGEN_FAST_MATH + // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit + // masks of all zeroes/ones in low 64 bits. + v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); + // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. 
+ unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); +#endif + // Continue with max computation. + Packet4f v = __builtin_msa_fmax_w(a, swapped); + v = __builtin_msa_fmax_w( + v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +#if !EIGEN_FAST_MATH + // Based on the mask select between v and 4 qNaNs. + v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] << "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + + kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] 
<< "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_w(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_w(a); +#else + Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, + const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, + const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +//---------- double ---------- + +typedef v2f64 Packet2d; +typedef v2i64 Packet2l; +typedef v2u64 Packet2ul; + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { + os << "[ " << 
value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + // FIXME check the Has* + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; + typedef Packet2d half; +}; + +template <> +EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { from, from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d plset(const double& a) { + EIGEN_MSA_DEBUG; + + static const Packet2d countdown = { 0.0, 1.0 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE 
Packet2d pdiv(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_d(c, a, b); +} + +// Logical Operations are not supported for float, so we have to reinterpret casts using MSA +// intrinsics +template <> +EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_d(a, b); +#else + // This prefers NaNs to numbers. + v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_d(a, b); +#else + // This prefers NaNs to numbers. 
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { *from, *from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { + EIGEN_MSA_DEBUG; + + Packet2d value; + value[0] = *from; + from += stride; + value[1] = *from; + return value; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const double* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE double predux(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d s = padd(a, preverse(a)); + return s[0]; +} + +// Other 
reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d p = pmul(a, preverse(a)); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmin_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1; +#endif +} + +// max +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmax_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_d(a); +#else + Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; 
+ asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, + const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0); + return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_MSA_H diff --git a/externals/eigen/Eigen/src/Core/arch/NEON/Complex.h b/externals/eigen/Eigen/src/Core/arch/NEON/Complex.h index 57e9b431..f40af7f8 100644 --- a/externals/eigen/Eigen/src/Core/arch/NEON/Complex.h +++ b/externals/eigen/Eigen/src/Core/arch/NEON/Complex.h @@ -15,9 +15,10 @@ namespace Eigen { namespace internal { -inline uint32x4_t p4ui_CONJ_XOR() { +inline uint32x4_t p4ui_CONJ_XOR() +{ // See bug 1325, clang fails to call vld1q_u64. 
-#if EIGEN_COMP_CLANG +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; return ret; #else @@ -26,61 +27,136 @@ inline uint32x4_t p4ui_CONJ_XOR() { #endif } -inline uint32x2_t p2ui_CONJ_XOR() { +inline uint32x2_t p2ui_CONJ_XOR() +{ static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; return vld1_u32( conj_XOR_DATA ); } //---------- float ---------- + +struct Packet1cf +{ + EIGEN_STRONG_INLINE Packet1cf() {} + EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {} + Packet2f v; +}; struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} - Packet4f v; + Packet4f v; }; -template<> struct packet_traits > : default_packet_traits +template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; - typedef Packet2cf half; - enum { + typedef Packet1cf half; + enum + { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, HasSetLinear = 0 }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; - -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +template<> struct unpacket_traits { - float32x2_t r64; - r64 = vld1_f32((float *)&from); + typedef std::complex type; + typedef Packet1cf half; + typedef Packet2f as_real; + enum + { + size = 1, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef std::complex type; + typedef Packet1cf half; + typedef Packet4f as_real; + enum + { + size = 2, + 
alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> EIGEN_STRONG_INLINE Packet1cf pcast(const float& a) +{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); } +template<> EIGEN_STRONG_INLINE Packet2cf pcast(const Packet2f& a) +{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pset1(const std::complex& from) +{ return Packet1cf(vld1_f32(reinterpret_cast(&from))); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + const float32x2_t r64 = vld1_f32(reinterpret_cast(&from)); return Packet2cf(vcombine_f32(r64, r64)); } -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cf padd(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(padd(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(psub(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) +{ + const Packet2ui b = vreinterpret_u32_f32(a.v); + return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); +} template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - Packet4ui b = vreinterpretq_u32_f32(a.v); + const 
Packet4ui b = vreinterpretq_u32_f32(a.v); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } +template<> EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) +{ + Packet2f v1, v2; + + // Get the real values of a | a1_re | a1_re | + v1 = vdup_lane_f32(a.v, 0); + // Get the imag values of a | a1_im | a1_im | + v2 = vdup_lane_f32(a.v, 1); + // Multiply the real a with b + v1 = vmul_f32(v1, b.v); + // Multiply the imag a with b + v2 = vmul_f32(v2, b.v); + // Conjugate v2 + v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); + // Swap real/imag elements in v2. + v2 = vrev64_f32(v2); + // Add and return the result + return Packet1cf(vadd_f32(v1, v2)); +} template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { Packet4f v1, v2; @@ -93,7 +169,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con v1 = vmulq_f32(v1, b.v); // Multiply the imag a with b v2 = vmulq_f32(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); // Swap real/imag elements in v2. 
v2 = vrev64q_f32(v2); @@ -101,98 +177,144 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con return Packet2cf(vaddq_f32(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) +template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0])] + Packet2f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0])] + Packet2f eq_swapped = vrev64_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cf(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) -{ - return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])] + Packet4f eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])] + Packet4f eq_swapped = vrev64q_f32(eq); + // Return re(a)==re(b) && im(a)==im(b) by computing 
bitwise AND of eq and eq_swapped + return Packet2cf(pand(eq, eq_swapped)); } -template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cf pand(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cf por(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE Packet1cf pxor(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_DEVICE_FUNC inline 
Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +template<> EIGEN_STRONG_INLINE Packet1cf pandnot(const Packet1cf& a, const Packet1cf& b) +{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) +{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cf ploaddup(const std::complex* from) +{ return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) +{ return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } + +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet2cf& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather, Packet1cf>( + const 
std::complex* from, Index stride) +{ + const Packet2f tmp = vdup_n_f32(std::real(from[0*stride])); + return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1)); +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + Packet4f res = vdupq_n_f32(std::real(from[0*stride])); res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); return Packet2cf(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cf>( + std::complex* to, const Packet1cf& from, Index stride) +{ to[stride*0] = std::complex(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); } +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>( + std::complex* to, const Packet2cf& from, Index stride) { to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cf& a) +{ + EIGEN_ALIGN16 std::complex x; + vst1_f32(reinterpret_cast(&x), a.v); + return x; +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 x[2]; - vst1q_f32((float *)x, a.v); + EIGEN_ALIGN16 std::complex x[2]; + vst1q_f32(reinterpret_cast(x), a.v); return x[0]; } +template<> EIGEN_STRONG_INLINE Packet1cf preverse(const 
Packet1cf& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - float32x2_t a_lo, a_hi; - Packet4f a_r128; - - a_lo = vget_low_f32(a.v); - a_hi = vget_high_f32(a.v); - a_r128 = vcombine_f32(a_hi, a_lo); - - return Packet2cf(a_r128); -} +{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip(const Packet1cf& a) +{ return Packet1cf(vrev64_f32(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) +{ return Packet2cf(vrev64q_f32(a.v)); } + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cf& a) { - return Packet2cf(vrev64q_f32(a.v)); + std::complex s; + vst1_f32((float *)&s, a.v); + return s; } - template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { - float32x2_t a1, a2; std::complex s; - - a1 = vget_low_f32(a.v); - a2 = vget_high_f32(a.v); - a2 = vadd_f32(a1, a2); - vst1_f32((float *)&s, a2); - + vst1_f32(reinterpret_cast(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v))); return s; } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cf& a) { - Packet4f sum1, sum2, sum; - - // Add the first two 64-bit float32x2_t of vecs[0] - sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); - sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); - sum = vaddq_f32(sum1, sum2); - - return Packet2cf(sum); + std::complex s; + vst1_f32((float *)&s, a.v); + return s; } - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { float32x2_t a1, a2, v1, v2, prod; @@ -208,88 +330,67 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P v1 = vmul_f32(v1, a2); // Multiply the imag a with b v2 = vmul_f32(v2, a2); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); // Swap real/imag elements in v2. 
v2 = vrev64_f32(v2); // Add v1, v2 prod = vadd_f32(v1, v2); - vst1_f32((float *)&s, prod); + vst1_f32(reinterpret_cast(&s), prod); return s; } -template -struct palign_impl -{ - EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = vextq_f32(first.v, second.v, 2); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) -template<> struct conj_helper +template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } + // TODO optimize it for NEON + Packet1cf res = pmul(a, pconj(b)); + Packet2f s, rev_s; - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; + // this computes the norm + s = vmul_f32(b.v, b.v); + rev_s = vrev64_f32(s); + return Packet1cf(pdiv(res.v, vadd_f32(s, rev_s))); +} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a,pconj(b)); Packet4f s, rev_s; // this computes the norm s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, 
vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) {} +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet1cf psqrt(const Packet1cf& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); +} + //---------- double ---------- #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; @@ -307,7 +408,8 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; - enum { + enum + { Vectorizable = 1, AlignedOnScalar = 0, size = 1, @@ -326,24 +428,50 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits +{ + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum + { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return 
Packet1cd(ploadu(reinterpret_cast(from))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ + /* here we really have to use unaligned loads :( */ + return ploadu(&from); +} -template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(padd(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) -{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) +{ return Packet1cd(pnegate(a.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d v1, v2; - // Get the real values of a + // Get the real values of a v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); // Get the imag values of a v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); @@ -351,7 
+479,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con v1 = vmulq_f64(v1, b.v); // Multiply the imag a with b v2 = vmulq_f64(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); // Swap real/imag elements in v2. v2 = preverse(v2); @@ -359,31 +487,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(vaddq_f64(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { - return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); + // Compare real and imaginary parts of a and b to get the mask vector: + // [re(a)==re(b), im(a)==im(b)] + Packet2d eq = pcmp_eq(a.v, b.v); + // Swap real/imag elements in the mask in to get: + // [im(a)==im(b), re(a)==re(b)] + Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq))); + // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped + return Packet1cd(pand(eq, eq_swapped)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + +template<> EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); 
} + +template<> EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } + template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) -{ - return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); -} +{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +{ return pset1(*from); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cd& from) +{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) +{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride) { Packet2d res = pset1(0.0); res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); @@ -391,17 +532,14 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Pack return Packet1cd(res); } 
-template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) -{ - to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); -} - +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>( + std::complex* to, const Packet1cd& from, Index stride) +{ to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); - return res; } @@ -409,57 +547,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
- } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); Packet2d rev_s = preverse(s); @@ -467,9 +562,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} +{ return Packet1cd(preverse(Packet2d(x.v))); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { @@ -477,6 +570,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); kernel.packet[1].v = tmp; } + +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); +} + #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h 
b/externals/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h new file mode 100644 index 00000000..3481f337 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -0,0 +1,183 @@ +namespace Eigen { +namespace internal { + +#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. +// Here we specialize gebp_traits to eliminate these register spills. +// See #2138. +template<> +struct gebp_traits + : gebp_traits +{ + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + // This volatile inline ASM both acts as a barrier to prevent reordering, + // as well as enforces strict register use. + asm volatile( + "vmla.f32 %q[r], %q[c], %q[alpha]" + : [r] "+w" (r) + : [c] "w" (c), + [alpha] "w" (alpha) + : ); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType&) const { + acc(a, b, c); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket& b, + Packet4f& c, Packet4f& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } +}; + +#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG + +#if EIGEN_ARCH_ARM64 + +template<> +struct gebp_traits + : gebp_traits +{ + typedef float RhsPacket; + typedef float32x4_t RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + dest = vld1q_f32(b); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, 
AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f32(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f32 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); + else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); + #else + c = vfmaq_laneq_f32(c, a, b, LaneID); + #endif + } +}; + + +template<> +struct gebp_traits + : gebp_traits +{ + typedef double RhsPacket; + + struct RhsPacketx4 { + float64x2_t B_0, B_1; + }; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = *b; + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& 
dest) const + { + dest.B_0 = vld1q_f64(b); + dest.B_1 = vld1q_f64(b+2); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { + c = vfmaq_n_f64(c, a, b); + } + + // NOTE: Template parameter inference failed when compiled with Android NDK: + // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const + { madd_helper<0>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const + { madd_helper<1>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const + { madd_helper<2>(a, b, c); } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const + { madd_helper<3>(a, b, c); } + + private: + template + EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const + { + #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) + // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 + // vfmaq_laneq_f64 is implemented through a costly dup + if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); + else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + 
else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); + #else + if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); + else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); + else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); + else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); + #endif + } +}; + +#endif // EIGEN_ARCH_ARM64 + +} // namespace internal +} // namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h index 6bb05bb9..fa6615a8 100644 --- a/externals/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -5,10 +5,6 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H @@ -16,74 +12,62 @@ namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - Packet4f tmp, fx; - - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 
4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - x = vminq_f32(x, p4f_exp_hi); - x = vmaxq_f32(x, p4f_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - Packet4ui mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, p4f_cephes_exp_C1); - Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); - z = vmulq_f32(x, x); - y = vaddq_f32(y, p4f_cephes_exp_p1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, p4f_cephes_exp_p5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, p4f_1); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, p4i_0x7f); - mm = vshlq_n_s32(mm, 23); - Packet4f pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp(const Packet2f& x) +{ return pexp_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) +{ return pexp_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog(const Packet2f& x) +{ return plog_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& x) +{ return plog_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin(const 
Packet2f& x) +{ return psin_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& x) +{ return psin_float(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos(const Packet2f& x) +{ return pcos_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& x) +{ return pcos_float(x); } + +// Hyperbolic Tangent function. +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh(const Packet2f& x) +{ return internal::generic_fast_tanh_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& x) +{ return internal::generic_fast_tanh_float(x); } + +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh) + +template <> +EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) { + Packet4f fexponent; + const Packet4bf out = F32ToBf16(pfrexp(Bf16ToF32(a), fexponent)); + exponent = F32ToBf16(fexponent); + return out; +} + +template <> +EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) { + return F32ToBf16(pldexp(Bf16ToF32(a), Bf16ToF32(exponent))); } +//---------- double ---------- + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& x) +{ return pexp_double(x); } + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d plog(const Packet2d& x) +{ return plog_double(x); } + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/arch/NEON/PacketMath.h 
b/externals/eigen/Eigen/src/Core/arch/NEON/PacketMath.h index 84a56bdc..d2aeef43 100644 --- a/externals/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/externals/eigen/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,23 +24,118 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #if EIGEN_ARCH_ARM64 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #else -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif #endif -typedef float32x2_t Packet2f; -typedef float32x4_t Packet4f; -typedef int32x4_t Packet4i; -typedef int32x2_t Packet2i; -typedef uint32x4_t Packet4ui; +#if EIGEN_COMP_MSVC_STRICT + +// In MSVC's arm_neon.h header file, all NEON vector types +// are aliases to the same underlying type __n128. +// We thus have to wrap them to make them different C++ types. +// (See also bug 1428) +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4c; +typedef eigen_packet_wrapper Packet8c; +typedef eigen_packet_wrapper Packet16c; +typedef eigen_packet_wrapper Packet4uc; +typedef eigen_packet_wrapper Packet8uc; +typedef eigen_packet_wrapper Packet16uc; +typedef eigen_packet_wrapper Packet4s; +typedef eigen_packet_wrapper Packet8s; +typedef eigen_packet_wrapper Packet4us; +typedef eigen_packet_wrapper Packet8us; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2ui; +typedef eigen_packet_wrapper Packet4ui; +typedef eigen_packet_wrapper Packet2l; +typedef eigen_packet_wrapper Packet2ul; + +#else + +typedef float32x2_t Packet2f; +typedef float32x4_t Packet4f; +typedef eigen_packet_wrapper Packet4c; +typedef int8x8_t Packet8c; +typedef int8x16_t Packet16c; +typedef eigen_packet_wrapper Packet4uc; +typedef uint8x8_t Packet8uc; 
+typedef uint8x16_t Packet16uc; +typedef int16x4_t Packet4s; +typedef int16x8_t Packet8s; +typedef uint16x4_t Packet4us; +typedef uint16x8_t Packet8us; +typedef int32x2_t Packet2i; +typedef int32x4_t Packet4i; +typedef uint32x2_t Packet2ui; +typedef uint32x4_t Packet4ui; +typedef int64x2_t Packet2l; +typedef uint64x2_t Packet2ul; + +#endif // EIGEN_COMP_MSVC_STRICT + +EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ + const float* a = reinterpret_cast(&m); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + return res; +} + +// fuctionally equivalent to _mm_shuffle_ps in SSE when interleave +// == false (i.e. shuffle(m, n, mask) equals _mm_shuffle_ps(m, n, mask)), +// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h +// to enable a shared implementation for fast inversion of matrices of size 4. +template +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) +{ + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + return res; +} + +template<> +EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask) +{ + const float* a = reinterpret_cast(&m); + const float* b = reinterpret_cast(&n); + Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + return res; +} + +EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));} + +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) +{ + return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) +{ + return 
shuffle2(a,b,eigen_neon_shuffle_mask(p, q, r, s)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) +{ + return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); +} +#define vec4f_duplane(a, p) \ + vdupq_lane_f32(vget_low_f32(a), p) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -51,673 +146,4439 @@ typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function -// which available on LLVM and GCC (at least) -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC +#if EIGEN_ARCH_ARM64 + // __builtin_prefetch tends to do nothing on ARM64 compilers because the + // prefetch instructions there are too detailed for __builtin_prefetch to map + // meaningfully to them. 
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); +#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif !EIGEN_ARCH_ARM64 - #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#elif EIGEN_ARCH_ARM32 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching #define EIGEN_ARM_PREFETCH(ADDR) #endif -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4f type; - typedef Packet4f half; // Packet2f intrinsics not implemented yet +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet2f half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. 
+ HasNdtri = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet16c type; + typedef Packet8c half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet16uc type; + typedef Packet8uc half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet8s type; + typedef Packet4s half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet8us type; + typedef Packet4us half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasAbsDiff = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet2i half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, 
+ HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet4ui type; + typedef Packet2ui half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet2l type; + typedef Packet2l half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits +{ + typedef Packet2ul type; + typedef Packet2ul half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0 + }; +}; + +#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM +// workaround gcc 4.2, 4.3 and 4.4 compilation issue +EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE void vst1q_f32(float* to, 
float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } +EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } +#endif + +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet2f half; + typedef Packet2i integer_packet; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef float type; + typedef Packet2f half; + typedef Packet4i integer_packet; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet4c half; + enum + { + size = 4, + alignment = Unaligned, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet4c half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int8_t type; + typedef Packet8c half; + enum + { + size = 16, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet4uc half; + enum + { + size = 4, + alignment = Unaligned, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet4uc half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint8_t type; + typedef Packet8uc half; + enum + { + size = 16, + alignment = 
Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false}; +}; +template<> struct unpacket_traits +{ + typedef int16_t type; + typedef Packet4s half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int16_t type; + typedef Packet4s half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint16_t type; + typedef Packet4us half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint16_t type; + typedef Packet4us half; + enum + { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet2i half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef int32_t type; + typedef Packet2i half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint32_t type; + typedef Packet2ui half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint32_t type; + typedef Packet2ui half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct 
unpacket_traits +{ + typedef int64_t type; + typedef Packet2l half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; +template<> struct unpacket_traits +{ + typedef uint64_t type; + typedef Packet2ul half; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet2f pset1(const float& from) { return vdup_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c pset1(const int8_t& from) +{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pset1(const int8_t& from) { return vdup_n_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { return vdupq_n_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc pset1(const uint8_t& from) +{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc pset1(const uint8_t& from) { return vdup_n_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { return vdupq_n_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s pset1(const int16_t& from) { return vdup_n_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s pset1(const int16_t& from) { return vdupq_n_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us pset1(const uint16_t& from) { return vdup_n_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us pset1(const uint16_t& from) { return vdupq_n_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i pset1(const int32_t& from) { return vdup_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui pset1(const uint32_t& from) { return vdup_n_u32(from); } 
+template<> EIGEN_STRONG_INLINE Packet4ui pset1(const uint32_t& from) { return vdupq_n_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { return vdupq_n_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul pset1(const uint64_t& from) { return vdupq_n_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet2f pset1frombits(unsigned int from) +{ return vreinterpret_f32_u32(vdup_n_u32(from)); } +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) +{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); } + +template<> EIGEN_STRONG_INLINE Packet2f plset(const float& a) +{ + const float c[] = {0.0f,1.0f}; + return vadd_f32(pset1(a), vld1_f32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) +{ + const float c[] = {0.0f,1.0f,2.0f,3.0f}; + return vaddq_f32(pset1(a), vld1q_f32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4c plset(const int8_t& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c plset(const int8_t& a) +{ + const int8_t c[] = {0,1,2,3,4,5,6,7}; + return vadd_s8(pset1(a), vld1_s8(c)); +} +template<> EIGEN_STRONG_INLINE Packet16c plset(const int8_t& a) +{ + const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + return vaddq_s8(pset1(a), vld1q_s8(c)); +} +template<> EIGEN_STRONG_INLINE Packet4uc plset(const uint8_t& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc plset(const uint8_t& a) +{ + const uint8_t c[] = {0,1,2,3,4,5,6,7}; + return vadd_u8(pset1(a), vld1_u8(c)); +} +template<> EIGEN_STRONG_INLINE Packet16uc plset(const uint8_t& a) +{ + const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + return vaddq_u8(pset1(a), vld1q_u8(c)); +} +template<> EIGEN_STRONG_INLINE Packet4s plset(const int16_t& a) +{ + const int16_t c[] = {0,1,2,3}; + return 
vadd_s16(pset1(a), vld1_s16(c)); +} +template<> EIGEN_STRONG_INLINE Packet4us plset(const uint16_t& a) +{ + const uint16_t c[] = {0,1,2,3}; + return vadd_u16(pset1(a), vld1_u16(c)); +} +template<> EIGEN_STRONG_INLINE Packet8s plset(const int16_t& a) +{ + const int16_t c[] = {0,1,2,3,4,5,6,7}; + return vaddq_s16(pset1(a), vld1q_s16(c)); +} +template<> EIGEN_STRONG_INLINE Packet8us plset(const uint16_t& a) +{ + const uint16_t c[] = {0,1,2,3,4,5,6,7}; + return vaddq_u16(pset1(a), vld1q_u16(c)); +} +template<> EIGEN_STRONG_INLINE Packet2i plset(const int32_t& a) +{ + const int32_t c[] = {0,1}; + return vadd_s32(pset1(a), vld1_s32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) +{ + const int32_t c[] = {0,1,2,3}; + return vaddq_s32(pset1(a), vld1q_s32(c)); +} +template<> EIGEN_STRONG_INLINE Packet2ui plset(const uint32_t& a) +{ + const uint32_t c[] = {0,1}; + return vadd_u32(pset1(a), vld1_u32(c)); +} +template<> EIGEN_STRONG_INLINE Packet4ui plset(const uint32_t& a) +{ + const uint32_t c[] = {0,1,2,3}; + return vaddq_u32(pset1(a), vld1q_u32(c)); +} +template<> EIGEN_STRONG_INLINE Packet2l plset(const int64_t& a) +{ + const int64_t c[] = {0,1}; + return vaddq_s64(pset1(a), vld1q_s64(c)); +} +template<> EIGEN_STRONG_INLINE Packet2ul plset(const uint64_t& a) +{ + const uint64_t c[] = {0,1}; + return vaddq_u64(pset1(a), vld1q_u64(c)); +} + +template<> EIGEN_STRONG_INLINE Packet2f padd(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c padd(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vadd_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c padd(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c padd(const Packet16c& a, 
const Packet16c& b) { return vaddq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc padd(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vadd_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc padd(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s padd(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us padd(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us padd(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i padd(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui padd(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui padd(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l padd(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul padd(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f psub(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4c psub(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vsub_s8( + 
vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c psub(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c psub(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc psub(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vsub_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc psub(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s psub(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us psub(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us psub(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i psub(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui psub(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui psub(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); +template<> EIGEN_STRONG_INLINE 
Packet2f paddsub(const Packet2f& a, const Packet2f & b) { + Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; + return padd(a, pxor(mask, b)); +} +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { + Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; + return padd(a, pxor(mask, b)); +} + +template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); } +template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { +#if EIGEN_ARCH_ARM64 + return vnegq_s64(a); +#else + return vcombine_s64( + vdup_n_s64(-vgetq_lane_s64(a, 0)), + vdup_n_s64(-vgetq_lane_s64(a, 1))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; } 
+// pconj: each specialization below returns its argument unchanged.
+template<> EIGEN_STRONG_INLINE Packet4uc pconj<Packet4uc>(const Packet4uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8uc pconj<Packet8uc>(const Packet8uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet16uc pconj<Packet16uc>(const Packet16uc& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4s pconj<Packet4s>(const Packet4s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8s pconj<Packet8s>(const Packet8s& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4us pconj<Packet4us>(const Packet4us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8us pconj<Packet8us>(const Packet8us& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2i pconj<Packet2i>(const Packet2i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj<Packet4i>(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ui pconj<Packet2ui>(const Packet2ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4ui pconj<Packet4ui>(const Packet4ui& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2l pconj<Packet2l>(const Packet2l& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2ul pconj<Packet2ul>(const Packet2ul& a) { return a; }
+
+// pmul: lane-wise product a*b, one specialization per packet type.
+template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+// Packet4c is handed to vdup_n_s32, i.e. treated as one 32-bit value: it is
+// broadcast into a D register, multiplied as int8 lanes, and lane 0 of the
+// 32-bit view is returned.
+template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
+// Same broadcast / multiply / extract-lane-0 scheme as Packet4c, on uint8 lanes.
+template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
+// 64-bit lanes are multiplied one at a time with scalar arithmetic and then
+// recombined into a Q register.
+template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+    vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
+    vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+    vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
+    vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
+}
+
+// pdiv (float): lane-wise a/b.
+// With EIGEN_ARCH_ARM64 the vdiv intrinsic is used. Otherwise the quotient is
+// computed as a * (1/b), where 1/b is a reciprocal estimate improved by a
+// single Newton-Raphson step, so the result is an approximation.
+template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdiv_f32(a,b);
+#else
+  // NEON does not offer a divide instruction here, so we use a reciprocal
+  // approximation. In contrast to other SIMD engines (AltiVec/SSE), NEON offers
+  // a reciprocal estimate AND a reciprocal step, which saves a few instructions.
+  // vrecpe_f32() returns an estimate of 1/b.
+  const Packet2f estimate = vrecpe_f32(b);
+
+  // vrecps_f32() returns the correction factor by which the estimate has to be
+  // multiplied to get a better approximation of 1/b.
+  const Packet2f reciprocal = vmul_f32(vrecps_f32(b, estimate), estimate);
+
+  // Finally, multiply a by 1/b to get the wanted result of the division.
+  return vmul_f32(a, reciprocal);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
+  // Same scheme as the Packet2f version, on a Q register:
+  // vrecpeq_f32() returns an estimate of 1/b.
+  const Packet4f estimate = vrecpeq_f32(b);
+
+  // vrecpsq_f32() returns the correction factor by which the estimate has to be
+  // multiplied to get a better approximation of 1/b.
+  const Packet4f reciprocal = vmulq_f32(vrecpsq_f32(b, estimate), estimate);
+
+  // Finally, multiply a by 1/b to get the wanted result of the division.
+  return vmulq_f32(a, reciprocal);
+#endif
+}
+
+// pdiv (integer): not implemented. Every specialization trips eigen_assert and
+// then returns a packet built with pset1 from the value 0. The packet type is
+// spelled out on pset1 because it cannot be deduced from the scalar argument.
+template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16c>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16uc>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8s>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8us>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2i>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4i>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4ui>(0);
+}
+template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2l>(0LL);
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
+{
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ul>(0ULL);
+}
+
+
+// pmadd (float): a*b + c. The accumulator c is the first intrinsic operand.
+// The vfma intrinsics are used when __ARM_FEATURE_FMA is defined, the vmla
+// intrinsics otherwise.
+#ifdef __ARM_FEATURE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{ return vfmaq_f32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{ return vfma_f32(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  return vmlaq_f32(c,a,b);
+}
+template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
+{
+  return vmla_f32(c,a,b);
+}
+#endif
+
+// No FMA instruction for int, so use MLA unconditionally.
+// pmadd (integer): a*b + c through the vmla intrinsics, with the accumulator c
+// as the first intrinsic operand.
+// Packet4c / Packet4uc are handed to vdup_n_{s,u}32, i.e. treated as one 32-bit
+// value: broadcast into a D register, processed as 8-bit lanes, and lane 0 of
+// the 32-bit view is returned.
+template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
+      vreinterpret_s8_s32(vdup_n_s32(c)),
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
+{ return vmla_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
+{ return vmlaq_s8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
+      vreinterpret_u8_u32(vdup_n_u32(c)),
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
+{ return vmla_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
+{ return vmlaq_u8(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
+{ return vmla_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
+{ return vmlaq_s16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
+{ return vmla_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
+{ return vmlaq_u16(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
+{ return vmla_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
+{ return vmlaq_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
+{ return vmla_u32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
+{ return vmlaq_u32(c,a,b); }
+
+// pabsdiff: lane-wise absolute difference through the vabd intrinsics.
+template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
+{ return vabd_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
+{ return vabdq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
+{ return vabd_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
+{ return vabdq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
+{ return vabd_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
+{ return vabdq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
+{ return vabd_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
+{ return vabdq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
+{ return vabd_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
+{ return vabdq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
+{ return vabd_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
+{ return vabdq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
+{ return vabd_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
+{ return vabdq_u32(a,b); }
+
+// pmin: lane-wise minimum. The default float specializations use vmin.
+template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+
+// NOTE(review): the NaN-propagation template arguments below (PropagateNumbers,
+// PropagateNaN) are assumed to be declared elsewhere in the project and to match
+// the primary pmin template - confirm against the upstream header. Without them
+// the float variants are indistinguishable and the forwarding ones call themselves.
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+#endif
+
+// The PropagateNaN variants forward to the default float specializations above.
+template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
+{
+  return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
+      vreinterpret_s8_s32(vdup_n_s32(a)),
+      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
+{
+  return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
+      vreinterpret_u8_u32(vdup_n_u32(a)),
+      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
+// 64-bit lanes are compared one at a time with std::min and then recombined.
+template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+// pmax: lane-wise maximum. The default float specializations use vmax.
+template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
+template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); } +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return pmax(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4c pmax(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_s8(vmax_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pmax(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pmax(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vmax_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pmax(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pmax(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pmax(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pmax(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { 
return vmaxq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pmax(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pmax(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pmax(const Packet2l& a, const Packet2l& b) { + return vcombine_s64( + vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), + vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); +} +template<> EIGEN_STRONG_INLINE Packet2ul pmax(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u64( + vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), + vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_le(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vcle_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_le(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vcle_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_le(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vcle_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vcle_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le(const Packet8uc& a, const Packet8uc& b) +{ return vcle_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) +{ return vcleq_u8(a,b); 
} +template<> EIGEN_STRONG_INLINE Packet4s pcmp_le(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vcle_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_le(const Packet4us& a, const Packet4us& b) +{ return vcle_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) +{ return vcleq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_le(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vcle_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le(const Packet2ui& a, const Packet2ui& b) +{ return vcle_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) +{ return vcleq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vcleq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vcleq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? 
numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vclt_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vclt_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vclt_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vclt_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt(const Packet8uc& a, const Packet8uc& b) +{ return vclt_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) +{ return vcltq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vclt_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt(const Packet4us& a, const Packet4us& b) +{ return vclt_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) +{ return vcltq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vclt_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const 
Packet4i& b) +{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt(const Packet2ui& a, const Packet2ui& b) +{ return vclt_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) +{ return vcltq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vcltq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vcltq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vceq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq(const Packet4c& a, const Packet4c& b) +{ + return vget_lane_s32(vreinterpret_s32_u8(vceq_s8( + vreinterpret_s8_s32(vdup_n_s32(a)), + vreinterpret_s8_s32(vdup_n_s32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq(const Packet8c& a, const Packet8c& b) +{ return vreinterpret_s8_u8(vceq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) +{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq(const Packet4uc& a, const Packet4uc& b) +{ + return vget_lane_u32(vreinterpret_u32_u8(vceq_u8( + vreinterpret_u8_u32(vdup_n_u32(a)), + 
vreinterpret_u8_u32(vdup_n_u32(b)))), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq(const Packet8uc& a, const Packet8uc& b) +{ return vceq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) +{ return vceqq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq(const Packet4s& a, const Packet4s& b) +{ return vreinterpret_s16_u16(vceq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) +{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq(const Packet4us& a, const Packet4us& b) +{ return vceq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) +{ return vceqq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq(const Packet2i& a, const Packet2i& b) +{ return vreinterpret_s32_u32(vceq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) +{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq(const Packet2ui& a, const Packet2ui& b) +{ return vceq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) +{ return vceqq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) +{ +#if EIGEN_ARCH_ARM64 + return vreinterpretq_s64_u64(vceqq_s64(a,b)); +#else + return vcombine_s64( + vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0), + vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq(const Packet2ul& a, const Packet2ul& b) +{ +#if EIGEN_ARCH_ARM64 + return vceqq_u64(a,b); +#else + return vcombine_u64( + vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0), + vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? 
numext::uint64_t(-1) : 0)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2f pand(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pand(const Packet4c& a, const Packet4c& b) +{ return a & b; } +template<> EIGEN_STRONG_INLINE Packet8c pand(const Packet8c& a, const Packet8c& b) +{ return vand_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pand(const Packet16c& a, const Packet16c& b) +{ return vandq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pand(const Packet4uc& a, const Packet4uc& b) +{ return a & b; } +template<> EIGEN_STRONG_INLINE Packet8uc pand(const Packet8uc& a, const Packet8uc& b) +{ return vand_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pand(const Packet16uc& a, const Packet16uc& b) +{ return vandq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pand(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pand(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pand(const Packet4us& a, const Packet4us& b) +{ return vand_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) +{ return vandq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pand(const Packet2i& a, const 
Packet2i& b) { return vand_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pand(const Packet2ui& a, const Packet2ui& b) +{ return vand_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) +{ return vandq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pand(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pand(const Packet2ul& a, const Packet2ul& b) +{ return vandq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f por(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c por(const Packet4c& a, const Packet4c& b) +{ return a | b; } +template<> EIGEN_STRONG_INLINE Packet8c por(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c por(const Packet16c& a, const Packet16c& b) +{ return vorrq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc por(const Packet4uc& a, const Packet4uc& b) +{ return a | b; } +template<> EIGEN_STRONG_INLINE Packet8uc por(const Packet8uc& a, const Packet8uc& b) +{ return vorr_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc por(const Packet16uc& a, const Packet16uc& b) +{ return vorrq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s por(const Packet4s& a, const Packet4s& b) +{ return vorr_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) +{ return vorrq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us por(const Packet4us& a, const Packet4us& b) +{ return vorr_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us 
por(const Packet8us& a, const Packet8us& b) +{ return vorrq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i por(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui por(const Packet2ui& a, const Packet2ui& b) +{ return vorr_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui por(const Packet4ui& a, const Packet4ui& b) +{ return vorrq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l por(const Packet2l& a, const Packet2l& b) +{ return vorrq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul por(const Packet2ul& a, const Packet2ul& b) +{ return vorrq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pxor(const Packet4c& a, const Packet4c& b) +{ return a ^ b; } +template<> EIGEN_STRONG_INLINE Packet8c pxor(const Packet8c& a, const Packet8c& b) +{ return veor_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pxor(const Packet16c& a, const Packet16c& b) +{ return veorq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pxor(const Packet4uc& a, const Packet4uc& b) +{ return a ^ b; } +template<> EIGEN_STRONG_INLINE Packet8uc pxor(const Packet8uc& a, const Packet8uc& b) +{ return veor_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pxor(const Packet16uc& a, const Packet16uc& b) +{ return veorq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pxor(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pxor(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); } +template<> 
EIGEN_STRONG_INLINE Packet4us pxor(const Packet4us& a, const Packet4us& b) +{ return veor_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pxor(const Packet8us& a, const Packet8us& b) +{ return veorq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pxor(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pxor(const Packet2ui& a, const Packet2ui& b) +{ return veor_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pxor(const Packet4ui& a, const Packet4ui& b) +{ return veorq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pxor(const Packet2l& a, const Packet2l& b) +{ return veorq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pxor(const Packet2ul& a, const Packet2ul& b) +{ return veorq_u64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2f pandnot(const Packet2f& a, const Packet2f& b) +{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } +template<> EIGEN_STRONG_INLINE Packet4c pandnot(const Packet4c& a, const Packet4c& b) +{ return a & ~b; } +template<> EIGEN_STRONG_INLINE Packet8c pandnot(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16c pandnot(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4uc pandnot(const Packet4uc& a, const Packet4uc& b) +{ return a & ~b; } +template<> EIGEN_STRONG_INLINE Packet8uc pandnot(const Packet8uc& a, const Packet8uc& b) +{ return vbic_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet16uc pandnot(const Packet16uc& a, const Packet16uc& b) +{ return vbicq_u8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4s pandnot(const 
Packet4s& a, const Packet4s& b) +{ return vbic_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8s pandnot(const Packet8s& a, const Packet8s& b) +{ return vbicq_s16(a,b); } +template<> EIGEN_STRONG_INLINE Packet4us pandnot(const Packet4us& a, const Packet4us& b) +{ return vbic_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet8us pandnot(const Packet8us& a, const Packet8us& b) +{ return vbicq_u16(a,b); } +template<> EIGEN_STRONG_INLINE Packet2i pandnot(const Packet2i& a, const Packet2i& b) +{ return vbic_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) +{ return vbicq_s32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ui pandnot(const Packet2ui& a, const Packet2ui& b) +{ return vbic_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4ui pandnot(const Packet4ui& a, const Packet4ui& b) +{ return vbicq_u32(a,b); } +template<> EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, const Packet2l& b) +{ return vbicq_s64(a,b); } +template<> EIGEN_STRONG_INLINE Packet2ul pandnot(const Packet2ul& a, const Packet2ul& b) +{ return vbicq_u64(a,b); } + + +template EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); } +template 
EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); } +template EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } + +template EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) +{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); } +template EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) +{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); } +template EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) +{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); } +template EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) +{ 
return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); } +template EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) +{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) +{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); } +template EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) +{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); } +template EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } + +template EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); } +template EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } +template EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); } +template EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); } +template EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); } +template 
EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); } +template EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); } +template EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); } +template EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); } +template EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); } + +template<> EIGEN_STRONG_INLINE Packet2f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) +{ + Packet4c res; + memcpy(&res, from, sizeof(Packet4c)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) +{ + Packet4uc res; + memcpy(&res, from, sizeof(Packet4uc)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* 
from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet2f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c ploadu(const int8_t* from) +{ + Packet4c res; + memcpy(&res, from, sizeof(Packet4c)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8c ploadu(const int8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); } +template<> EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); } +template<> EIGEN_STRONG_INLINE Packet4uc ploadu(const uint8_t* from) +{ + Packet4uc res; + memcpy(&res, from, sizeof(Packet4uc)); + return res; +} +template<> EIGEN_STRONG_INLINE Packet8uc ploadu(const uint8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); } +template<> EIGEN_STRONG_INLINE Packet16uc 
ploadu(const uint8_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); } +template<> EIGEN_STRONG_INLINE Packet4s ploadu(const int16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); } +template<> EIGEN_STRONG_INLINE Packet8s ploadu(const int16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); } +template<> EIGEN_STRONG_INLINE Packet4us ploadu(const uint16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); } +template<> EIGEN_STRONG_INLINE Packet8us ploadu(const uint16_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); } +template<> EIGEN_STRONG_INLINE Packet2i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet2ui ploadu(const uint32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploadu(const uint32_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); } +template<> EIGEN_STRONG_INLINE Packet2l ploadu(const int64_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul ploadu(const uint64_t* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet2f ploaddup(const float* from) +{ return vld1_dup_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet4c ploaddup(const int8_t* from) +{ + const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload(from))); + return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0); +} +template<> EIGEN_STRONG_INLINE Packet8c ploaddup(const int8_t* from) +{ + const int8x8_t a = vld1_s8(from); + return vzip_s8(a,a).val[0]; +} +template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const int8_t* 
from) +{ + const int8x8_t a = vld1_s8(from); + const int8x8x2_t b = vzip_s8(a,a); + return vcombine_s8(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload(from))); + return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0); +} +template<> EIGEN_STRONG_INLINE Packet8uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vld1_u8(from); + return vzip_u8(a,a).val[0]; +} +template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* from) +{ + const uint8x8_t a = vld1_u8(from); + const uint8x8x2_t b = vzip_u8(a,a); + return vcombine_u8(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4s ploaddup(const int16_t* from) +{ + return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), + vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const int16_t* from) +{ + const int16x4_t a = vld1_s16(from); + const int16x4x2_t b = vzip_s16(a,a); + return vcombine_s16(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet4us ploaddup(const uint16_t* from) +{ + return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), + vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const uint16_t* from) +{ + const uint16x4_t a = vld1_u16(from); + const uint16x4x2_t b = vzip_u16(a,a); + return vcombine_u16(b.val[0], b.val[1]); +} +template<> EIGEN_STRONG_INLINE Packet2i ploaddup(const int32_t* from) +{ return vld1_dup_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) +{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); } +template<> EIGEN_STRONG_INLINE Packet2ui ploaddup(const uint32_t* from) +{ return vld1_dup_u32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploaddup(const uint32_t* from) +{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); } 
+template<> EIGEN_STRONG_INLINE Packet2l ploaddup(const int64_t* from) +{ return vld1q_dup_s64(from); } +template<> EIGEN_STRONG_INLINE Packet2ul ploaddup(const uint64_t* from) +{ return vld1q_dup_u64(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { return vld1q_dup_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4c ploadquad(const int8_t* from) +{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8c ploadquad(const int8_t* from) +{ + return vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from)), + vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet16c ploadquad(const int8_t* from) +{ + const int8x8_t a = vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from)), + vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); + const int8x8_t b = vreinterpret_s8_u32(vzip_u32( + vreinterpret_u32_s8(vld1_dup_s8(from+2)), + vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]); + return vcombine_s8(a,b); +} +template<> EIGEN_STRONG_INLINE Packet4uc ploadquad(const uint8_t* from) +{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc ploadquad(const uint8_t* from) +{ + return vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from)), + vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); +} +template<> EIGEN_STRONG_INLINE Packet16uc ploadquad(const uint8_t* from) +{ + const uint8x8_t a = vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from)), + vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); + const uint8x8_t b = vreinterpret_u8_u32(vzip_u32( + vreinterpret_u32_u8(vld1_dup_u8(from+2)), + vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]); + return vcombine_u8(a,b); +} +template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const int16_t* from) +{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); } +template<> EIGEN_STRONG_INLINE 
Packet8us ploadquad(const uint16_t* from) +{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); } +template<> EIGEN_STRONG_INLINE Packet4i ploadquad(const int32_t* from) { return vld1q_dup_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { return vld1q_dup_u32(from); } + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); } +template<> 
EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); } +template<> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet2f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet4c& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet8c& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet4uc& from) +{ memcpy(to, &from, sizeof(from)); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet8uc& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet4s& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet8s& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const 
Packet4us& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet8us& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet2i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet2ui& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet4ui& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet2l& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(uint64_t* to, const Packet2ul& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather(const float* from, Index stride) +{ + Packet2f res = vld1_dup_f32(from); + res = vld1_lane_f32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather(const float* from, Index stride) +{ + Packet4f res = vld1q_dup_f32(from); + res = vld1q_lane_f32(from + 1*stride, res, 1); + res = vld1q_lane_f32(from + 2*stride, res, 2); + res = vld1q_lane_f32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather(const int8_t* from, Index stride) +{ + Packet4c res; + for (int i = 0; i != 4; i++) + reinterpret_cast(&res)[i] = *(from + i * stride); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather(const int8_t* from, Index stride) +{ + Packet8c res = vld1_dup_s8(from); + res = vld1_lane_s8(from + 1*stride, res, 1); + res = vld1_lane_s8(from + 2*stride, res, 2); + res = vld1_lane_s8(from + 
3*stride, res, 3); + res = vld1_lane_s8(from + 4*stride, res, 4); + res = vld1_lane_s8(from + 5*stride, res, 5); + res = vld1_lane_s8(from + 6*stride, res, 6); + res = vld1_lane_s8(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather(const int8_t* from, Index stride) +{ + Packet16c res = vld1q_dup_s8(from); + res = vld1q_lane_s8(from + 1*stride, res, 1); + res = vld1q_lane_s8(from + 2*stride, res, 2); + res = vld1q_lane_s8(from + 3*stride, res, 3); + res = vld1q_lane_s8(from + 4*stride, res, 4); + res = vld1q_lane_s8(from + 5*stride, res, 5); + res = vld1q_lane_s8(from + 6*stride, res, 6); + res = vld1q_lane_s8(from + 7*stride, res, 7); + res = vld1q_lane_s8(from + 8*stride, res, 8); + res = vld1q_lane_s8(from + 9*stride, res, 9); + res = vld1q_lane_s8(from + 10*stride, res, 10); + res = vld1q_lane_s8(from + 11*stride, res, 11); + res = vld1q_lane_s8(from + 12*stride, res, 12); + res = vld1q_lane_s8(from + 13*stride, res, 13); + res = vld1q_lane_s8(from + 14*stride, res, 14); + res = vld1q_lane_s8(from + 15*stride, res, 15); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather(const uint8_t* from, Index stride) +{ + Packet4uc res; + for (int i = 0; i != 4; i++) + reinterpret_cast(&res)[i] = *(from + i * stride); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather(const uint8_t* from, Index stride) +{ + Packet8uc res = vld1_dup_u8(from); + res = vld1_lane_u8(from + 1*stride, res, 1); + res = vld1_lane_u8(from + 2*stride, res, 2); + res = vld1_lane_u8(from + 3*stride, res, 3); + res = vld1_lane_u8(from + 4*stride, res, 4); + res = vld1_lane_u8(from + 5*stride, res, 5); + res = vld1_lane_u8(from + 6*stride, res, 6); + res = vld1_lane_u8(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather(const uint8_t* from, Index stride) +{ + Packet16uc res = vld1q_dup_u8(from); + res = 
vld1q_lane_u8(from + 1*stride, res, 1); + res = vld1q_lane_u8(from + 2*stride, res, 2); + res = vld1q_lane_u8(from + 3*stride, res, 3); + res = vld1q_lane_u8(from + 4*stride, res, 4); + res = vld1q_lane_u8(from + 5*stride, res, 5); + res = vld1q_lane_u8(from + 6*stride, res, 6); + res = vld1q_lane_u8(from + 7*stride, res, 7); + res = vld1q_lane_u8(from + 8*stride, res, 8); + res = vld1q_lane_u8(from + 9*stride, res, 9); + res = vld1q_lane_u8(from + 10*stride, res, 10); + res = vld1q_lane_u8(from + 11*stride, res, 11); + res = vld1q_lane_u8(from + 12*stride, res, 12); + res = vld1q_lane_u8(from + 13*stride, res, 13); + res = vld1q_lane_u8(from + 14*stride, res, 14); + res = vld1q_lane_u8(from + 15*stride, res, 15); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather(const int16_t* from, Index stride) +{ + Packet4s res = vld1_dup_s16(from); + res = vld1_lane_s16(from + 1*stride, res, 1); + res = vld1_lane_s16(from + 2*stride, res, 2); + res = vld1_lane_s16(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather(const int16_t* from, Index stride) +{ + Packet8s res = vld1q_dup_s16(from); + res = vld1q_lane_s16(from + 1*stride, res, 1); + res = vld1q_lane_s16(from + 2*stride, res, 2); + res = vld1q_lane_s16(from + 3*stride, res, 3); + res = vld1q_lane_s16(from + 4*stride, res, 4); + res = vld1q_lane_s16(from + 5*stride, res, 5); + res = vld1q_lane_s16(from + 6*stride, res, 6); + res = vld1q_lane_s16(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather(const uint16_t* from, Index stride) +{ + Packet4us res = vld1_dup_u16(from); + res = vld1_lane_u16(from + 1*stride, res, 1); + res = vld1_lane_u16(from + 2*stride, res, 2); + res = vld1_lane_u16(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather(const uint16_t* from, Index stride) +{ + Packet8us res = vld1q_dup_u16(from); 
+ res = vld1q_lane_u16(from + 1*stride, res, 1); + res = vld1q_lane_u16(from + 2*stride, res, 2); + res = vld1q_lane_u16(from + 3*stride, res, 3); + res = vld1q_lane_u16(from + 4*stride, res, 4); + res = vld1q_lane_u16(from + 5*stride, res, 5); + res = vld1q_lane_u16(from + 6*stride, res, 6); + res = vld1q_lane_u16(from + 7*stride, res, 7); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather(const int32_t* from, Index stride) +{ + Packet2i res = vld1_dup_s32(from); + res = vld1_lane_s32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather(const int32_t* from, Index stride) +{ + Packet4i res = vld1q_dup_s32(from); + res = vld1q_lane_s32(from + 1*stride, res, 1); + res = vld1q_lane_s32(from + 2*stride, res, 2); + res = vld1q_lane_s32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather(const uint32_t* from, Index stride) +{ + Packet2ui res = vld1_dup_u32(from); + res = vld1_lane_u32(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather(const uint32_t* from, Index stride) +{ + Packet4ui res = vld1q_dup_u32(from); + res = vld1q_lane_u32(from + 1*stride, res, 1); + res = vld1q_lane_u32(from + 2*stride, res, 2); + res = vld1q_lane_u32(from + 3*stride, res, 3); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather(const int64_t* from, Index stride) +{ + Packet2l res = vld1q_dup_s64(from); + res = vld1q_lane_s64(from + 1*stride, res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather(const uint64_t* from, Index stride) +{ + Packet2ul res = vld1q_dup_u64(from); + res = vld1q_lane_u64(from + 1*stride, res, 1); + return res; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet2f& from, Index stride) +{ + vst1_lane_f32(to + stride*0, from, 0); + vst1_lane_f32(to + 
stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(float* to, const Packet4f& from, Index stride) +{ + vst1q_lane_f32(to + stride*0, from, 0); + vst1q_lane_f32(to + stride*1, from, 1); + vst1q_lane_f32(to + stride*2, from, 2); + vst1q_lane_f32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet4c& from, Index stride) +{ + for (int i = 0; i != 4; i++) + *(to + i * stride) = reinterpret_cast(&from)[i]; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet8c& from, Index stride) +{ + vst1_lane_s8(to + stride*0, from, 0); + vst1_lane_s8(to + stride*1, from, 1); + vst1_lane_s8(to + stride*2, from, 2); + vst1_lane_s8(to + stride*3, from, 3); + vst1_lane_s8(to + stride*4, from, 4); + vst1_lane_s8(to + stride*5, from, 5); + vst1_lane_s8(to + stride*6, from, 6); + vst1_lane_s8(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int8_t* to, const Packet16c& from, Index stride) +{ + vst1q_lane_s8(to + stride*0, from, 0); + vst1q_lane_s8(to + stride*1, from, 1); + vst1q_lane_s8(to + stride*2, from, 2); + vst1q_lane_s8(to + stride*3, from, 3); + vst1q_lane_s8(to + stride*4, from, 4); + vst1q_lane_s8(to + stride*5, from, 5); + vst1q_lane_s8(to + stride*6, from, 6); + vst1q_lane_s8(to + stride*7, from, 7); + vst1q_lane_s8(to + stride*8, from, 8); + vst1q_lane_s8(to + stride*9, from, 9); + vst1q_lane_s8(to + stride*10, from, 10); + vst1q_lane_s8(to + stride*11, from, 11); + vst1q_lane_s8(to + stride*12, from, 12); + vst1q_lane_s8(to + stride*13, from, 13); + vst1q_lane_s8(to + stride*14, from, 14); + vst1q_lane_s8(to + stride*15, from, 15); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet4uc& from, Index stride) +{ + for (int i = 0; i != 4; i++) + *(to + i * stride) = reinterpret_cast(&from)[i]; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
pscatter(uint8_t* to, const Packet8uc& from, Index stride) +{ + vst1_lane_u8(to + stride*0, from, 0); + vst1_lane_u8(to + stride*1, from, 1); + vst1_lane_u8(to + stride*2, from, 2); + vst1_lane_u8(to + stride*3, from, 3); + vst1_lane_u8(to + stride*4, from, 4); + vst1_lane_u8(to + stride*5, from, 5); + vst1_lane_u8(to + stride*6, from, 6); + vst1_lane_u8(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint8_t* to, const Packet16uc& from, Index stride) +{ + vst1q_lane_u8(to + stride*0, from, 0); + vst1q_lane_u8(to + stride*1, from, 1); + vst1q_lane_u8(to + stride*2, from, 2); + vst1q_lane_u8(to + stride*3, from, 3); + vst1q_lane_u8(to + stride*4, from, 4); + vst1q_lane_u8(to + stride*5, from, 5); + vst1q_lane_u8(to + stride*6, from, 6); + vst1q_lane_u8(to + stride*7, from, 7); + vst1q_lane_u8(to + stride*8, from, 8); + vst1q_lane_u8(to + stride*9, from, 9); + vst1q_lane_u8(to + stride*10, from, 10); + vst1q_lane_u8(to + stride*11, from, 11); + vst1q_lane_u8(to + stride*12, from, 12); + vst1q_lane_u8(to + stride*13, from, 13); + vst1q_lane_u8(to + stride*14, from, 14); + vst1q_lane_u8(to + stride*15, from, 15); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet4s& from, Index stride) +{ + vst1_lane_s16(to + stride*0, from, 0); + vst1_lane_s16(to + stride*1, from, 1); + vst1_lane_s16(to + stride*2, from, 2); + vst1_lane_s16(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int16_t* to, const Packet8s& from, Index stride) +{ + vst1q_lane_s16(to + stride*0, from, 0); + vst1q_lane_s16(to + stride*1, from, 1); + vst1q_lane_s16(to + stride*2, from, 2); + vst1q_lane_s16(to + stride*3, from, 3); + vst1q_lane_s16(to + stride*4, from, 4); + vst1q_lane_s16(to + stride*5, from, 5); + vst1q_lane_s16(to + stride*6, from, 6); + vst1q_lane_s16(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const 
Packet4us& from, Index stride) +{ + vst1_lane_u16(to + stride*0, from, 0); + vst1_lane_u16(to + stride*1, from, 1); + vst1_lane_u16(to + stride*2, from, 2); + vst1_lane_u16(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint16_t* to, const Packet8us& from, Index stride) +{ + vst1q_lane_u16(to + stride*0, from, 0); + vst1q_lane_u16(to + stride*1, from, 1); + vst1q_lane_u16(to + stride*2, from, 2); + vst1q_lane_u16(to + stride*3, from, 3); + vst1q_lane_u16(to + stride*4, from, 4); + vst1q_lane_u16(to + stride*5, from, 5); + vst1q_lane_u16(to + stride*6, from, 6); + vst1q_lane_u16(to + stride*7, from, 7); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet2i& from, Index stride) +{ + vst1_lane_s32(to + stride*0, from, 0); + vst1_lane_s32(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int32_t* to, const Packet4i& from, Index stride) +{ + vst1q_lane_s32(to + stride*0, from, 0); + vst1q_lane_s32(to + stride*1, from, 1); + vst1q_lane_s32(to + stride*2, from, 2); + vst1q_lane_s32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet2ui& from, Index stride) +{ + vst1_lane_u32(to + stride*0, from, 0); + vst1_lane_u32(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint32_t* to, const Packet4ui& from, Index stride) +{ + vst1q_lane_u32(to + stride*0, from, 0); + vst1q_lane_u32(to + stride*1, from, 1); + vst1q_lane_u32(to + stride*2, from, 2); + vst1q_lane_u32(to + stride*3, from, 3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(int64_t* to, const Packet2l& from, Index stride) +{ + vst1q_lane_s64(to + stride*0, from, 0); + vst1q_lane_s64(to + stride*1, from, 1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(uint64_t* to, const Packet2ul& from, Index stride) +{ + vst1q_lane_u64(to + stride*0, from, 
0); + vst1q_lane_u64(to + stride*1, from, 1); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet2f& a) { return vget_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return vgetq_lane_f32(a,0); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet4c& a) { return static_cast(a & 0xff); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet8c& a) { return vget_lane_s8(a,0); } +template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) { return vgetq_lane_s8(a,0); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet4uc& a) { return static_cast(a & 0xff); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet8uc& a) { return vget_lane_u8(a,0); } +template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) { return vgetq_lane_u8(a,0); } +template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet4s& a) { return vget_lane_s16(a,0); } +template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet8s& a) { return vgetq_lane_s16(a,0); } +template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet4us& a) { return vget_lane_u16(a,0); } 
+template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet8us& a) { return vgetq_lane_u16(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet2i& a) { return vget_lane_s32(a,0); } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { return vgetq_lane_s32(a,0); } +template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet2ui& a) { return vget_lane_u32(a,0); } +template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { return vgetq_lane_u32(a,0); } +template<> EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { return vgetq_lane_s64(a,0); } +template<> EIGEN_STRONG_INLINE uint64_t pfirst(const Packet2ul& a) { return vgetq_lane_u64(a,0); } + +template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + const float32x4_t a_r64 = vrev64q_f32(a); + return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) +{ + const int8x16_t a_r64 = vrev64q_s8(a); + return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) +{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); } +template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) +{ + const uint8x16_t a_r64 = vrev64q_u8(a); + return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); } +template<> EIGEN_STRONG_INLINE 
Packet8s preverse(const Packet8s& a) +{ + const int16x8_t a_r64 = vrev64q_s16(a); + return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); } +template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) +{ + const uint16x8_t a_r64 = vrev64q_u16(a); + return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + const int32x4_t a_r64 = vrev64q_s32(a); + return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); } +template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) +{ + const uint32x4_t a_r64 = vrev64q_u32(a); + return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64)); +} +template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) +{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); } +template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) +{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); } + +template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4c pabs(const Packet4c& a) +{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } +template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); } +template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); } +template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet16uc pabs(const 
Packet16uc& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); } +template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); } +template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); } +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } +template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { +#if EIGEN_ARCH_ARM64 + return vabsq_s64(a); +#else + return vcombine_s64( + vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), + vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1)))); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2f pfrexp(const Packet2f& a, Packet2f& exponent) +{ return pfrexp_generic(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) +{ return pfrexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE Packet2f pldexp(const Packet2f& a, const Packet2f& exponent) +{ return pldexp_generic(a,exponent); } +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) +{ return pldexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE float predux(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); } +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpadd_f32(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + 
int8x8_t sum = vpadd_s8(a_dup, a_dup); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet8c& a) +{ + int8x8_t sum = vpadd_s8(a,a); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) +{ + int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a)); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + sum = vpadd_s8(sum, sum); + return vget_lane_s8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t sum = vpadd_u8(a_dup, a_dup); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet8uc& a) +{ + uint8x8_t sum = vpadd_u8(a,a); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) +{ + uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a)); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + sum = vpadd_u8(sum, sum); + return vget_lane_u8(sum, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet4s& a) +{ + const int16x4_t sum = vpadd_s16(a,a); + return vget_lane_s16(vpadd_s16(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux(const Packet8s& a) +{ + int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a)); + sum = vpadd_s16(sum, sum); + sum = vpadd_s16(sum, sum); + return vget_lane_s16(sum, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet4us& a) +{ + const uint16x4_t sum = vpadd_u16(a,a); + return vget_lane_u16(vpadd_u16(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet8us& a) +{ + uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a)); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + return vget_lane_u16(sum, 0); +} 
+template<> EIGEN_STRONG_INLINE int32_t predux(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); } +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) +{ + const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpadd_s32(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) +{ + const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpadd_u32(sum, sum), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) +{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } +template<> EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) +{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) +{ + return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, + vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) +{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) +{ + return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, + vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) +{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) +{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) +{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); } + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float 
predux_mul(const Packet2f& a) +{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) +{ + int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); + prod = vmul_s8(prod, vrev16_s8(prod)); + return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2); +} +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) +{ + int8x8_t prod = vmul_s8(a, vrev16_s8(a)); + prod = vmul_s8(prod, vrev32_s8(prod)); + return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); +} +template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) +{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) +{ + uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); + prod = vmul_u8(prod, vrev16_u8(prod)); + return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) +{ + uint8x8_t prod = vmul_u8(a, vrev16_u8(a)); + prod = vmul_u8(prod, vrev32_u8(prod)); + return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) +{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } +template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) +{ + const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); + return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); +} +template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet8s& a) +{ + int16x4_t prod; + + // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| + prod = vmul_s16(vget_low_s16(a), vget_high_s16(a)); + // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| + prod = vmul_s16(prod, vrev32_s16(prod)); + // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| + return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); +} +template<> 
EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet4us& a) +{ + const uint16x4_t prod = vmul_u16(a, vrev32_u16(a)); + return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a) +{ + uint16x4_t prod; + + // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| + prod = vmul_u16(vget_low_u16(a), vget_high_u16(a)); + // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| + prod = vmul_u16(prod, vrev32_u16(prod)); + // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| + return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); +} +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) +{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) +{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } +template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) +{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } +template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) +{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } +template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) +{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); } +template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) +{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); } + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet2f& a) +{ return vget_lane_f32(vpmin_f32(a,a), 0); } +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmin_f32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t min = vpmin_s8(a_dup, a_dup); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet8c& a) +{ 
+ int8x8_t min = vpmin_s8(a,a); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) +{ + int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a)); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + min = vpmin_s8(min, min); + return vget_lane_s8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t min = vpmin_u8(a_dup, a_dup); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet8uc& a) +{ + uint8x8_t min = vpmin_u8(a,a); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) +{ + uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a)); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + min = vpmin_u8(min, min); + return vget_lane_u8(min, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet4s& a) +{ + const int16x4_t min = vpmin_s16(a,a); + return vget_lane_s16(vpmin_s16(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet8s& a) +{ + int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a)); + min = vpmin_s16(min, min); + min = vpmin_s16(min, min); + return vget_lane_s16(min, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet4us& a) +{ + const uint16x4_t min = vpmin_u16(a,a); + return vget_lane_u16(vpmin_u16(min, min), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet8us& a) +{ + uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a)); + min = vpmin_u16(min, min); + min = vpmin_u16(min, min); + return vget_lane_u16(min, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet2i& a) +{ return vget_lane_s32(vpmin_s32(a,a), 0); } +template<> 
EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) +{ + const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmin_s32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet2ui& a) +{ return vget_lane_u32(vpmin_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) +{ + const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpmin_u32(min, min), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux_min(const Packet2l& a) +{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } +template<> EIGEN_STRONG_INLINE uint64_t predux_min(const Packet2ul& a) +{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet2f& a) +{ return vget_lane_f32(vpmax_f32(a,a), 0); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a)); + return vget_lane_f32(vpmax_f32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet4c& a) +{ + const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); + int8x8_t max = vpmax_s8(a_dup, a_dup); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet8c& a) +{ + int8x8_t max = vpmax_s8(a,a); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) +{ + int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a)); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + max = vpmax_s8(max, max); + return vget_lane_s8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet4uc& a) +{ + const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t max = vpmax_u8(a_dup, a_dup); + max = vpmax_u8(max, max); + return 
vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet8uc& a) +{ + uint8x8_t max = vpmax_u8(a,a); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) +{ + uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a)); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + max = vpmax_u8(max, max); + return vget_lane_u8(max, 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet4s& a) +{ + const int16x4_t max = vpmax_s16(a,a); + return vget_lane_s16(vpmax_s16(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet8s& a) +{ + int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a)); + max = vpmax_s16(max, max); + max = vpmax_s16(max, max); + return vget_lane_s16(max, 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet4us& a) +{ + const uint16x4_t max = vpmax_u16(a,a); + return vget_lane_u16(vpmax_u16(max, max), 0); +} +template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet8us& a) +{ + uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a)); + max = vpmax_u16(max, max); + max = vpmax_u16(max, max); + return vget_lane_u16(max, 0); +} +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet2i& a) +{ return vget_lane_s32(vpmax_s32(a,a), 0); } +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) +{ + const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a)); + return vget_lane_s32(vpmax_s32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet2ui& a) +{ return vget_lane_u32(vpmax_u32(a,a), 0); } +template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) +{ + const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a)); + return vget_lane_u32(vpmax_u32(max, max), 0); +} +template<> EIGEN_STRONG_INLINE int64_t predux_max(const Packet2l& a) +{ return (std::max)(vgetq_lane_s64(a, 
0), vgetq_lane_s64(a, 1)); } +template<> EIGEN_STRONG_INLINE uint64_t predux_max(const Packet2ul& a) +{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } + +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +{ + uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), + vget_high_u32(vreinterpretq_u32_f32(x))); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0); +} + +// Helpers for ptranspose. +namespace detail { + +template +void zip_in_place(Packet& p1, Packet& p2); + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2f& p1, Packet2f& p2) { + const float32x2x2_t tmp = vzip_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4f& p1, Packet4f& p2) { + const float32x4x2_t tmp = vzipq_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8c& p1, Packet8c& p2) { + const int8x8x2_t tmp = vzip_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16c& p1, Packet16c& p2) { + const int8x16x2_t tmp = vzipq_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8uc& p1, Packet8uc& p2) { + const uint8x8x2_t tmp = vzip_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16uc& p1, Packet16uc& p2) { + const uint8x16x2_t tmp = vzipq_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2i& p1, Packet2i& p2) { + const int32x2x2_t tmp = vzip_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4i& p1, Packet4i& p2) { + const int32x4x2_t tmp = vzipq_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2ui& p1, Packet2ui& p2) { + const uint32x2x2_t tmp = vzip_u32(p1, p2); + 
p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4ui& p1, Packet4ui& p2) { + const uint32x4x2_t tmp = vzipq_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4s& p1, Packet4s& p2) { + const int16x4x2_t tmp = vzip_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8s& p1, Packet8s& p2) { + const int16x8x2_t tmp = vzipq_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4us& p1, Packet4us& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8us& p1, Packet8us& p2) { + const uint16x8x2_t tmp = vzipq_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], 
kernel.packet[7]); +} + +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + EIGEN_UNROLL_LOOP + for (int i=0; i<4; ++i) { + const int m = (1 << i); + EIGEN_UNROLL_LOOP + for (int j=0; j& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); + const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); + + const int8x8x2_t zip8 = vzip_s8(a,b); + const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); + + kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); + const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 
1)); + + const uint8x8x2_t zip8 = vzip_u8(a,b); + const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); + + kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::zip_in_place(kernel.packet[0], kernel.packet[1]); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ +#if EIGEN_ARCH_ARM64 + const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = tmp1; +#else + const int64x1_t tmp[2][2] = { + { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) }, + { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) } + }; + + kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]); + kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]); +#endif +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ +#if EIGEN_ARCH_ARM64 + const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = tmp1; +#else + const uint64x1_t tmp[2][2] = { + { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) }, + { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) } + }; + + kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]); + kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b) +{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) +{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) +{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) +{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) +{ return vbsl_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b) +{ return vbslq_u8(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) +{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) +{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) +{ return vbsl_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) +{ return vbslq_u16(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) +{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) +{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) +{ return vbsl_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) +{ return vbslq_u32(mask, a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, 
const Packet2l& b) +{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) +{ return vbslq_u64(mask, a, b); } + +// Use armv8 rounding intinsics if available. +#if EIGEN_ARCH_ARMV8 +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) +{ return vrndn_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) +{ return vrndnq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) +{ return vrndm_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ return vrndmq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ return vrndp_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ return vrndpq_f32(a); } + +#else + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet2f limit = pset1(static_cast(1<<23)); + const Packet2f abs_a = pabs(a); + Packet2f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. 
+ r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0f); + Packet2f tmp = print(a); + // If greater, subtract one. + Packet2f mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If smaller, add one. + Packet4f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) +{ + const Packet2f cst_1 = pset1(1.0); + Packet2f tmp = print(a); + // If smaller, add one. + Packet2f mask = pcmp_lt(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +#endif + +/** + * Computes the integer square root + * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result + * and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument + * value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf . 
+ */ +template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) { + uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a)); + uint8x8_t res = vdup_n_u8(0); + uint8x8_t add = vdup_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x8_t temp = vorr_u8(res, add); + res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res); + add = vshr_n_u8(add, 1); + } + return vget_lane_u32(vreinterpret_u32_u8(res), 0); +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) { + uint8x8_t res = vdup_n_u8(0); + uint8x8_t add = vdup_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x8_t temp = vorr_u8(res, add); + res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res); + add = vshr_n_u8(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) { + uint8x16_t res = vdupq_n_u8(0); + uint8x16_t add = vdupq_n_u8(0x8); + for (int i = 0; i < 4; i++) + { + const uint8x16_t temp = vorrq_u8(res, add); + res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res); + add = vshrq_n_u8(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) { + uint16x4_t res = vdup_n_u16(0); + uint16x4_t add = vdup_n_u16(0x80); + for (int i = 0; i < 8; i++) + { + const uint16x4_t temp = vorr_u16(res, add); + res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res); + add = vshr_n_u16(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) { + uint16x8_t res = vdupq_n_u16(0); + uint16x8_t add = vdupq_n_u16(0x80); + for (int i = 0; i < 8; i++) + { + const uint16x8_t temp = vorrq_u16(res, add); + res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res); + add = vshrq_n_u16(add, 1); + } + return res; +} +/// @copydoc 
Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) { + uint32x2_t res = vdup_n_u32(0); + uint32x2_t add = vdup_n_u32(0x8000); + for (int i = 0; i < 16; i++) + { + const uint32x2_t temp = vorr_u32(res, add); + res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res); + add = vshr_n_u32(add, 1); + } + return res; +} +/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) +template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { + uint32x4_t res = vdupq_n_u32(0); + uint32x4_t add = vdupq_n_u32(0x8000); + for (int i = 0; i < 16; i++) + { + const uint32x4_t temp = vorrq_u32(res, add); + res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res); + add = vshrq_n_u32(add, 1); + } + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + // Compute approximate reciprocal sqrt. + Packet4f x = vrsqrteq_f32(a); + // Do Newton iterations for 1/sqrt(x). + x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); + x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); + const Packet4f infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) { + // Compute approximate reciprocal sqrt. + Packet2f x = vrsqrte_f32(a); + // Do Newton iterations for 1/sqrt(x). + x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); + x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); + const Packet2f infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +// Unfortunately vsqrt_f32 is only available for A64. 
+#if EIGEN_ARCH_ARM64 +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);} +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); } +#else +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + const Packet4f infinity = pset1(NumTraits::infinity()); + const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); + return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +} +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { + const Packet2f infinity = pset1(NumTraits::infinity()); + const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); + return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +} +#endif + +//---------- bfloat16 ---------- +// TODO: Add support for native armv8.6-a bfloat16_t + +// TODO: Guard if we have native bfloat16 support +typedef eigen_packet_wrapper Packet4bf; + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4bf type; + typedef Packet4bf half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. 
+ HasNdtri = 0 + }; +}; + +template<> struct unpacket_traits +{ + typedef bfloat16 type; + typedef Packet4bf half; + enum + { + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +namespace detail { +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4bf& p1, Packet4bf& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} +} // namespace detail + +EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) +{ + // See the scalar implemention in BFloat16.h for a comprehensible explanation + // of this fast rounding algorithm + Packet4ui input = reinterpret_cast(p); + + // lsb = (input >> 16) & 1 + Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1)); + + // rounding_bias = 0x7fff + lsb + Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff)); + + // input += rounding_bias + input = vaddq_u32(input, rounding_bias); + + // input = input >> 16 + input = vshrq_n_u32(input, 16); + + // Replace float-nans by bfloat16-nans, that is 0x7fc0 + const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0); + const Packet4ui mask = vceqq_f32(p, p); + input = vbslq_u32(mask, input, bf16_nan); + + // output = static_cast(input) + return vmovn_u32(input); +} + +EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) +{ + return reinterpret_cast(vshlq_n_u32(vmovl_u16(p), 16)); +} + +EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { + return vmovn_u32(vreinterpretq_u32_f32(p)); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pset1(const bfloat16& from) { + return pset1(from.value); +} + +template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { + return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(from))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) +{ + return pload(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) +{ + 
return ploadu(reinterpret_cast(from)); +} + +template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) +{ + EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet4bf& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast(to), from); +} + +template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) +{ + return ploaddup(reinterpret_cast(from)); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { + return F32ToBf16(pabs(Bf16ToF32(a))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmin(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmin(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template <> EIGEN_STRONG_INLINE Packet4bf pmax(const Packet4bf &a, + const Packet4bf &b) +{ + return F32ToBf16(pmax(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) +{ + return F32ToBf16(plset(static_cast(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { + return por(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { + return pxor(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { + return pand(a, b); +} + +template<> EIGEN_STRONG_INLINE 
Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { + return pandnot(a, b); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, + const Packet4bf& b) +{ + return pselect(mask, a, b); +} + +template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) +{ + return F32ToBf16(print(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pfloor(const Packet4bf& a) +{ + return F32ToBf16(pfloor(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pceil(const Packet4bf& a) +{ + return F32ToBf16(pceil(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4bf padd(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(padd(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf psub(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(psub(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pmul(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, const Packet4bf& b) { + return F32ToBf16(pdiv(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> +EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) +{ + return pgather(reinterpret_cast(from), stride); +} + +template<> +EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) +{ + pscatter(reinterpret_cast(to), from, stride); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) +{ + return static_cast(predux(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet4bf& a) +{ + return static_cast(predux_max(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet4bf& a) +{ + return static_cast(predux_min(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE 
bfloat16 predux_mul(const Packet4bf& a) +{ + return static_cast(predux_mul(Bf16ToF32(a))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) +{ + return preverse(a); +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + detail::ptranspose_impl(kernel); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff(const Packet4bf& a, const Packet4bf& b) +{ + return F32ToBf16(pabsdiff(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); +} + +template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) +{ + return pxor(a, pset1(static_cast(0x8000))); +} + +//---------- double ---------- + +// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double. +// Confirmed at least with __apple_build_version__ = 6000054. +#ifdef __apple_build_version__ +// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. +// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with +// major toolchain updates. 
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) +#else +#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 +#endif + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG + +// Bug 907: workaround missing declarations of the following two functions in the ADK +// Defining these functions as templates ensures that if these intrinsics are +// already defined in arm_neon.h, then our workaround doesn't cause a conflict +// and has lower priority in overload resolution. +template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } + +template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } + +typedef float64x2_t Packet2d; +typedef float64x1_t Packet1d; + +// fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) +// Currently used in LU/arch/InverseSize4.h to enable a shared implementation +// for fast inversion of matrices of size 4. +EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) +{ + const double* a = reinterpret_cast(&m); + const double* b = reinterpret_cast(&n); + Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; + return res; +} + +EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) +{ + return shuffle(a, b, mask); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b) +{ + return shuffle(a, b, 0); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b) +{ + return shuffle(a, b, 3); +} +#define vec2d_duplane(a, p) \ + vdupq_laneq_f64(a, p) + +template<> struct packet_traits : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum + { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasCmp = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 1, + HasMin = 1, + HasMax = 1, + HasConj = 
1, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = 0, + HasErf = 0 + }; +}; + +template<> struct unpacket_traits +{ + typedef double type; + typedef Packet2d half; + typedef Packet2l integer_packet; + enum + { + size = 2, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) +{ + const double c[] = {0.0,1.0}; + return vaddq_f64(pset1(a), vld1q_f64(c)); +} + +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ + const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; + return padd(a, pxor(mask, b)); +} + +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } + +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vfmaq_f64(c,a,b); } +#else +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) +{ return vmlaq_f64(c,a,b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } + +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); } + +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return pmin(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return pmax(a, b); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } + +template<> 
EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) +{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { return vld1q_dup_f64(from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) +{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather(const double* from, Index stride) +{ + Packet2d res = pset1(0.0); + res = vld1q_lane_f64(from + 0*stride, res, 0); + res = vld1q_lane_f64(from + 1*stride, res, 1); + return res; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(double* to, const Packet2d& from, Index stride) +{ + vst1q_lane_f64(to + stride*0, from, 0); + vst1q_lane_f64(to + stride*1, from, 1); +} + +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } + +// FIXME only store the 2 first elements ? 
+template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a,0); } + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } + +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } + +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif + +// Other reduction functions: +// mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif + +// min +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ return vgetq_lane_f64(vpminq_f64(a,a), 0); } + +// max +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } + + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) +{ + const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); + + kernel.packet[0] = tmp1; + kernel.packet[1] = tmp2; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) +{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) +{ return vrndnq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ return vrndmq_f64(a); } + +template<> 
EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ return vrndpq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) +{ return pldexp_generic(a, exponent); } + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) +{ return pfrexp_generic(a,exponent); } + +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) +{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); } + +template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + // Compute approximate reciprocal sqrt. + Packet2d x = vrsqrteq_f64(a); + // Do Newton iterations for 1/sqrt(x). + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x); + const Packet2d infinity = pset1(NumTraits::infinity()); + return pselect(pcmp_eq(a, pzero(a)), infinity, x); +} + +template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } + +#endif // EIGEN_ARCH_ARM64 + +// Do we have an fp16 types and supporting Neon intrinsics? 
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +typedef float16x4_t Packet4hf; +typedef float16x8_t Packet8hf; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8hf type; + typedef Packet4hf half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 4, - HasHalfPacket=0, // Packet2f intrinsics not implemented yet - - HasDiv = 1, - // FIXME check the Has* - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, - HasSqrt = 0 + size = 8, + HasHalfPacket = 1, + + HasCmp = 1, + HasCast = 1, + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasAbsDiff = 0, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasInsert = 1, + HasReduxp = 1, + HasDiv = 1, + HasFloor = 1, + HasCeil = 1, + HasRint = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasRsqrt = 1, + HasErf = EIGEN_FAST_MATH, + HasBessel = 0, // Issues with accuracy. + HasNdtri = 0 }; }; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4i type; - typedef Packet4i half; // Packet2i intrinsics not implemented yet + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet4hf half; enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket=0 // Packet2i intrinsics not implemented yet - // FIXME check the Has* + size = 4, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false }; }; -#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM -// workaround gcc 4.2, 4.3 and 4.4 compilatin issue -EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* 
to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } -#endif - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; - -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet4hf half; + enum { + size = 8, + alignment = Aligned16, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) -{ - const float32_t f[] = {0, 1, 2, 3}; - Packet4f countdown = vld1q_f32(f); - return vaddq_f32(pset1(a), countdown); +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4(const Packet8hf& a) { + return vadd_f16(vget_low_f16(a), vget_high_f16(a)); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) -{ - const int32_t i[] = {0, 1, 2, 3}; - Packet4i countdown = vld1q_s32(i); - return vaddq_s32(pset1(a), countdown); + +template <> +EIGEN_STRONG_INLINE Packet8hf pset1(const Eigen::half& from) { + return vdupq_n_f16(from.x); } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf pset1(const Eigen::half& from) { + return vdup_n_f16(from.x); +} -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const 
Packet4i& b) { return vsubq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet8hf plset(const Eigen::half& a) { + const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7}; + Packet8hf countdown = vld1q_f16(f); + return vaddq_f16(pset1(a), countdown); +} -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } +template <> +EIGEN_STRONG_INLINE Packet4hf plset(const Eigen::half& a) { + const float16_t f[] = {0, 1, 2, 3}; + Packet4hf countdown = vld1_f16(f); + return vadd_f16(pset1(a), countdown); +} -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template <> +EIGEN_STRONG_INLINE Packet8hf padd(const Packet8hf& a, const Packet8hf& b) { + return vaddq_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf padd(const Packet4hf& a, const Packet4hf& b) { + return vadd_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ -#if EIGEN_ARCH_ARM64 - return vdivq_f32(a,b); -#else - Packet4f inv, restep, div; +template <> +EIGEN_STRONG_INLINE Packet8hf psub(const Packet8hf& a, const Packet8hf& b) { + return vsubq_f16(a, b); +} - // NEON does not offer a divide instruction, we have to do a reciprocal approximation - // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with - // Newton-Raphson and vrecpsq_f32() - inv = vrecpeq_f32(b); +template <> +EIGEN_STRONG_INLINE Packet4hf psub(const Packet4hf& a, const 
Packet4hf& b) { + return vsub_f16(a, b); +} - // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. - restep = vrecpsq_f32(b, inv); - inv = vmulq_f32(restep, inv); +template <> +EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) { + return vnegq_f16(a); +} - // Finally, multiply a by 1/b and get the wanted result of the division. - div = vmulq_f32(a, inv); +template <> +EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) { + return vneg_f16(a); +} - return div; -#endif +template <> +EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) { + return a; } -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); +template <> +EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) { + return a; } -// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, -// then implements a slow software scalar fallback calling fmaf()! -// Filed LLVM bug: -// https://llvm.org/bugs/show_bug.cgi?id=27216 -#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) -// See bug 936. -// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. -// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. -// MLA is not fused i.e. does 2 roundings. -// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): -// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { -#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM - // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, - // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. 
Since the former is the default on - // -march=armv7-a, that is a very common case. - // See e.g. this thread: - // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html - // Filed LLVM bug: - // https://llvm.org/bugs/show_bug.cgi?id=27219 - Packet4f r = c; - asm volatile( - "vmla.f32 %q[r], %q[a], %q[b]" - : [r] "+w" (r) - : [a] "w" (a), - [b] "w" (b) - : ); - return r; -#else - return vmlaq_f32(c,a,b); -#endif +template <> +EIGEN_STRONG_INLINE Packet8hf pmul(const Packet8hf& a, const Packet8hf& b) { + return vmulq_f16(a, b); } -#endif -// No FMA instruction for int, so use MLA unconditionally. -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf pmul(const Packet4hf& a, const Packet4hf& b) { + return vmul_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet8hf pdiv(const Packet8hf& a, const Packet8hf& b) { + return vdivq_f16(a, b); +} -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf pdiv(const Packet4hf& a, const Packet4hf& b) { + return vdiv_f16(a, b); +} -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) { + return vfmaq_f16(c, a, b); } -template<> 
EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) { + return vfma_f16(c, a, b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { + return vminq_f16(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +template <> +EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { + return vmin_f16(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). 
+template<> EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); } +template<> EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet4hf pmin(const Packet4hf& a, const Packet4hf& b) { return pmin(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet8hf pmin(const Packet8hf& a, const Packet8hf& b) { return pmin(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - float32x2_t lo, hi; - lo = vld1_dup_f32(from); - hi = vld1_dup_f32(from+1); - return vcombine_f32(lo, hi); +template <> +EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { + return vmaxq_f16(a, b); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) -{ - int32x2_t lo, hi; - lo = vld1_dup_s32(from); - hi = vld1_dup_s32(from+1); - return vcombine_s32(lo, hi); + +template <> +EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { + return vmax_f16(a, b); } -template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +#ifdef __ARM_FEATURE_NUMERIC_MAXMIN +// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems). 
+template<> EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); } +template<> EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); } +#endif -template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE Packet4hf pmax(const Packet4hf& a, const Packet4hf& b) { return pmax(a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - Packet4f res = pset1(0.f); - res = vsetq_lane_f32(from[0*stride], res, 0); - res = vsetq_lane_f32(from[1*stride], res, 1); - res = vsetq_lane_f32(from[2*stride], res, 2); - res = vsetq_lane_f32(from[3*stride], res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) -{ - Packet4i res = pset1(0); - res = vsetq_lane_s32(from[0*stride], res, 0); - res = vsetq_lane_s32(from[1*stride], res, 1); - res = vsetq_lane_s32(from[2*stride], res, 2); - res = vsetq_lane_s32(from[3*stride], res, 3); - return res; -} +template<> EIGEN_STRONG_INLINE Packet8hf pmax(const Packet8hf& a, const Packet8hf& b) { return pmax(a, b); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - to[stride*0] = vgetq_lane_f32(from, 0); - to[stride*1] = vgetq_lane_f32(from, 1); - to[stride*2] = vgetq_lane_f32(from, 2); - to[stride*3] = vgetq_lane_f32(from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) -{ - to[stride*0] = vgetq_lane_s32(from, 0); - to[stride*1] = vgetq_lane_s32(from, 1); - to[stride*2] = vgetq_lane_s32(from, 2); - to[stride*3] = vgetq_lane_s32(from, 3); -} +#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \ + template <> \ + 
EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \ + return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \ + } -template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } +#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \ + template <> \ + EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \ + return vreinterpret_f16_u16(vc##name##_f16(a, b)); \ + } -// FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +EIGEN_MAKE_ARM_FP16_CMP_8(eq) +EIGEN_MAKE_ARM_FP16_CMP_8(lt) +EIGEN_MAKE_ARM_FP16_CMP_8(le) -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - float32x2_t a_lo, a_hi; - Packet4f a_r64; +EIGEN_MAKE_ARM_FP16_CMP_4(eq) +EIGEN_MAKE_ARM_FP16_CMP_4(lt) +EIGEN_MAKE_ARM_FP16_CMP_4(le) - a_r64 = vrev64q_f32(a); - a_lo = vget_low_f32(a_r64); - a_hi = vget_high_f32(a_r64); - return vcombine_f32(a_hi, a_lo); +#undef EIGEN_MAKE_ARM_FP16_CMP_8 +#undef EIGEN_MAKE_ARM_FP16_CMP_4 + +template <> +EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b))); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - int32x2_t a_lo, a_hi; - Packet4i a_r64; - a_r64 = vrev64q_s32(a); - a_lo = vget_low_s32(a_r64); - a_hi = vget_high_s32(a_r64); - return vcombine_s32(a_hi, a_lo); +template <> +EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b))); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } -template<> 
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } +template <> +EIGEN_STRONG_INLINE Packet8hf print(const Packet8hf& a) +{ return vrndnq_f16(a); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf print(const Packet4hf& a) +{ return vrndn_f16(a); } - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - sum = vpadd_f32(a_lo, a_hi); - sum = vpadd_f32(sum, sum); - return vget_lane_f32(sum, 0); -} +template <> +EIGEN_STRONG_INLINE Packet8hf pfloor(const Packet8hf& a) +{ return vrndmq_f16(a); } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - float32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4f sum1, sum2, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf pfloor(const Packet4hf& a) +{ return vrndm_f16(a); } - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_f32(vecs[0], vecs[2]); - vtrn2 = vzipq_f32(vecs[1], vecs[3]); - res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); +template <> +EIGEN_STRONG_INLINE Packet8hf pceil(const Packet8hf& a) +{ return vrndpq_f16(a); } - // Do the addition of the resulting vectors - sum1 = vaddq_f32(res1.val[0], res1.val[1]); - sum2 = vaddq_f32(res2.val[0], res2.val[1]); - sum = vaddq_f32(sum1, sum2); +template <> +EIGEN_STRONG_INLINE Packet4hf pceil(const Packet4hf& a) +{ return vrndp_f16(a); } - return sum; +template <> +EIGEN_STRONG_INLINE Packet8hf psqrt(const Packet8hf& a) { + return vsqrtq_f16(a); } -template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf psqrt(const Packet4hf& a) { + return vsqrt_f16(a); +} - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - sum = vpadd_s32(a_lo, a_hi); - sum = vpadd_s32(sum, sum); - return vget_lane_s32(sum, 0); +template <> 
+EIGEN_STRONG_INLINE Packet8hf pand(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); } -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - int32x4x2_t vtrn1, vtrn2, res1, res2; - Packet4i sum1, sum2, sum; +template <> +EIGEN_STRONG_INLINE Packet4hf pand(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} - // NEON zip performs interleaving of the supplied vectors. - // We perform two interleaves in a row to acquire the transposed vector - vtrn1 = vzipq_s32(vecs[0], vecs[2]); - vtrn2 = vzipq_s32(vecs[1], vecs[3]); - res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); - res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); +template <> +EIGEN_STRONG_INLINE Packet8hf por(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} - // Do the addition of the resulting vectors - sum1 = vaddq_s32(res1.val[0], res1.val[1]); - sum2 = vaddq_s32(res2.val[0], res2.val[1]); - sum = vaddq_s32(sum1, sum2); +template <> +EIGEN_STRONG_INLINE Packet4hf por(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} - return sum; +template <> +EIGEN_STRONG_INLINE Packet8hf pxor(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); } -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, prod; +template <> +EIGEN_STRONG_INLINE Packet4hf pxor(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); +} - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - // Get the product 
of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_f32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_f32(prod, vrev64_f32(prod)); +template <> +EIGEN_STRONG_INLINE Packet8hf pandnot(const Packet8hf& a, const Packet8hf& b) { + return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b))); +} - return vget_lane_f32(prod, 0); +template <> +EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packet4hf& b) { + return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b))); } -template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, prod; - // Get a_lo = |a1|a2| and a_hi = |a3|a4| - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| - prod = vmul_s32(a_lo, a_hi); - // Multiply prod with its swapped value |a2*a4|a1*a3| - prod = vmul_s32(prod, vrev64_s32(prod)); +template <> +EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +} - return vget_lane_s32(prod, 0); +template <> +EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); } -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, min; +template <> +EIGEN_STRONG_INLINE Packet8hf ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +} - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - min = vpmin_f32(a_lo, a_hi); - min = vpmin_f32(min, min); +template <> +EIGEN_STRONG_INLINE Packet4hf ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); +} - return vget_lane_f32(min, 0); +template <> +EIGEN_STRONG_INLINE Packet8hf ploaddup(const Eigen::half* from) { + Packet8hf packet; + packet[0] = from[0].x; + packet[1] = from[0].x; + 
packet[2] = from[1].x; + packet[3] = from[1].x; + packet[4] = from[2].x; + packet[5] = from[2].x; + packet[6] = from[3].x; + packet[7] = from[3].x; + return packet; } -template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, min; +template <> +EIGEN_STRONG_INLINE Packet4hf ploaddup(const Eigen::half* from) { + float16x4_t packet; + float16_t* tmp; + tmp = (float16_t*)&packet; + tmp[0] = from[0].x; + tmp[1] = from[0].x; + tmp[2] = from[1].x; + tmp[3] = from[1].x; + return packet; +} - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - min = vpmin_s32(a_lo, a_hi); - min = vpmin_s32(min, min); - - return vget_lane_s32(min, 0); +template <> +EIGEN_STRONG_INLINE Packet8hf ploadquad(const Eigen::half* from) { + Packet4hf lo, hi; + lo = vld1_dup_f16(reinterpret_cast(from)); + hi = vld1_dup_f16(reinterpret_cast(from+1)); + return vcombine_f16(lo, hi); } -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - float32x2_t a_lo, a_hi, max; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); } - a_lo = vget_low_f32(a); - a_hi = vget_high_f32(a); - max = vpmax_f32(a_lo, a_hi); - max = vpmax_f32(max, max); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); } - return vget_lane_f32(max, 0); +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) { + return vbslq_f16(vreinterpretq_u16_f16(mask), a, b); } -template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) -{ - int32x2_t a_lo, a_hi, max; - - a_lo = vget_low_s32(a); - a_hi = vget_high_s32(a); - max = vpmax_s32(a_lo, a_hi); - max = vpmax_s32(max, max); - - return vget_lane_s32(max, 0); -} - -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 
and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0,Packet4f,vextq_f32) -PALIGN_NEON(1,Packet4f,vextq_f32) -PALIGN_NEON(2,Packet4f,vextq_f32) -PALIGN_NEON(3,Packet4f,vextq_f32) -PALIGN_NEON(0,Packet4i,vextq_s32) -PALIGN_NEON(1,Packet4i,vextq_s32) -PALIGN_NEON(2,Packet4i,vextq_s32) -PALIGN_NEON(3,Packet4i,vextq_s32) - -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); - kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); - kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); - kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); - kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) { + return vbsl_f16(vreinterpret_u16_f16(mask), a, b); } -//---------- double ---------- - -// Clang 3.5 in the iOS toolchain has an 
ICE triggered by NEON intrisics for double. -// Confirmed at least with __apple_build_version__ = 6000054. -#ifdef __apple_build_version__ -// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. -// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with -// major toolchain updates. -#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) -#else -#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 -#endif +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); } -#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); } -// Bug 907: workaround missing declarations of the following two functions in the ADK -// Defining these functions as templates ensures that if these intrinsics are -// already defined in arm_neon.h, then our workaround doesn't cause a conflict -// and has lower priority in overload resolution. 
-template -uint64x2_t vreinterpretq_u64_f64(T a) -{ - return (uint64x2_t) a; +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8hf& from) { + EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); } -template -float64x2_t vreinterpretq_f64_u64(T a) -{ - return (float64x2_t) a; +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4hf& from) { + EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast(to), from); } -typedef float64x2_t Packet2d; -typedef float64x1_t Packet1d; - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet2d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket=0, - - HasDiv = 1, - // FIXME check the Has* - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 0, - HasSqrt = 0 - }; -}; +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8hf& from) { + EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); +} -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4hf& from) { + EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast(to), from); +} -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather(const Eigen::half* from, Index stride) { + Packet8hf res = pset1(Eigen::half(0.f)); + res = vsetq_lane_f16(from[0 * stride].x, res, 0); + res = vsetq_lane_f16(from[1 * stride].x, res, 1); + res = vsetq_lane_f16(from[2 * stride].x, res, 2); + res = vsetq_lane_f16(from[3 * stride].x, res, 3); + res = vsetq_lane_f16(from[4 * stride].x, res, 4); + res = vsetq_lane_f16(from[5 * stride].x, res, 5); + res = vsetq_lane_f16(from[6 * stride].x, res, 6); + res = vsetq_lane_f16(from[7 * stride].x, res, 7); + return res; +} -template<> 
EIGEN_STRONG_INLINE Packet2d plset(const double& a) -{ - const double countdown_raw[] = {0.0,1.0}; - const Packet2d countdown = vld1q_f64(countdown_raw); - return vaddq_f64(pset1(a), countdown); +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather(const Eigen::half* from, Index stride) { + Packet4hf res = pset1(Eigen::half(0.f)); + res = vset_lane_f16(from[0 * stride].x, res, 0); + res = vset_lane_f16(from[1 * stride].x, res, 1); + res = vset_lane_f16(from[2 * stride].x, res, 2); + res = vset_lane_f16(from[3 * stride].x, res, 3); + return res; } -template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8hf& from, Index stride) { + to[stride * 0].x = vgetq_lane_f16(from, 0); + to[stride * 1].x = vgetq_lane_f16(from, 1); + to[stride * 2].x = vgetq_lane_f16(from, 2); + to[stride * 3].x = vgetq_lane_f16(from, 3); + to[stride * 4].x = vgetq_lane_f16(from, 4); + to[stride * 5].x = vgetq_lane_f16(from, 5); + to[stride * 6].x = vgetq_lane_f16(from, 6); + to[stride * 7].x = vgetq_lane_f16(from, 7); +} -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4hf& from, Index stride) { + to[stride * 0].x = vget_lane_f16(from, 0); + to[stride * 1].x = vget_lane_f16(from, 1); + to[stride * 2].x = vget_lane_f16(from, 2); + to[stride * 3].x = vget_lane_f16(from, 3); +} -template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } +template <> +EIGEN_STRONG_INLINE void prefetch(const Eigen::half* addr) { + EIGEN_ARM_PREFETCH(addr); +} -template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); } +template <> 
+EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8hf& a) { + float16_t x[8]; + vst1q_f16(x, a); + Eigen::half h; + h.x = x[0]; + return h; +} -template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4hf& a) { + float16_t x[4]; + vst1_f16(x, a); + Eigen::half h; + h.x = x[0]; + return h; +} -#ifdef __ARM_FEATURE_FMA -// See bug 936. See above comment about FMA for float. -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } -#else -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } -#endif +template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) { + float16x4_t a_lo, a_hi; + Packet8hf a_r64; -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } + a_r64 = vrev64q_f16(a); + a_lo = vget_low_f16(a_r64); + a_hi = vget_high_f16(a_r64); + return vcombine_f16(a_hi, a_lo); +} -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } +template <> +EIGEN_STRONG_INLINE Packet4hf preverse(const Packet4hf& a) { + return vrev64_f16(a); +} -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Packet8hf pabs(const Packet8hf& a) { + return vabsq_f16(a); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Packet4hf pabs(const Packet4hf& a) { + return vabs_f16(a); 
} -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet8hf& a) { + float16x4_t a_lo, a_hi, sum; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + sum = vpadd_f16(a_lo, a_hi); + sum = vpadd_f16(sum, sum); + sum = vpadd_f16(sum, sum); + + Eigen::half h; + h.x = vget_lane_f16(sum, 0); + return h; } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) -{ - return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet4hf& a) { + float16x4_t sum; + + sum = vpadd_f16(a, a); + sum = vpadd_f16(sum, sum); + Eigen::half h; + h.x = vget_lane_f16(sum, 0); + return h; } -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8hf& a) { + float16x4_t a_lo, a_hi, prod; -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + prod = vmul_f16(a_lo, a_hi); + prod = vmul_f16(prod, vrev64_f16(prod)); -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -{ - return vld1q_dup_f64(from); + Eigen::half h; + h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1)); + return h; } -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet4hf& a) { + float16x4_t prod; + prod = vmul_f16(a, vrev64_f16(a)); + Eigen::half h; + h.x = 
vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1)); + return h; +} -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) -{ - Packet2d res = pset1(0.0); - res = vsetq_lane_f64(from[0*stride], res, 0); - res = vsetq_lane_f64(from[1*stride], res, 1); - return res; +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8hf& a) { + float16x4_t a_lo, a_hi, min; + + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + min = vpmin_f16(a_lo, a_hi); + min = vpmin_f16(min, min); + min = vpmin_f16(min, min); + + Eigen::half h; + h.x = vget_lane_f16(min, 0); + return h; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) -{ - to[stride*0] = vgetq_lane_f64(from, 0); - to[stride*1] = vgetq_lane_f64(from, 1); + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet4hf& a) { + Packet4hf tmp; + tmp = vpmin_f16(a, a); + tmp = vpmin_f16(tmp, tmp); + Eigen::half h; + h.x = vget_lane_f16(tmp, 0); + return h; } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } -// FIXME only store the 2 first elements ? 
-template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a, 0); } +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8hf& a) { + float16x4_t a_lo, a_hi, max; -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } + a_lo = vget_low_f16(a); + a_hi = vget_high_f16(a); + max = vpmax_f16(a_lo, a_hi); + max = vpmax_f16(max, max); + max = vpmax_f16(max, max); -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } + Eigen::half h; + h.x = vget_lane_f16(max, 0); + return h; +} -#if EIGEN_COMP_CLANG && defined(__apple_build_version__) -// workaround ICE, see bug 907 -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } -#else -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } -#endif +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet4hf& a) { + Packet4hf tmp; + tmp = vpmax_f16(a, a); + tmp = vpmax_f16(tmp, tmp); + Eigen::half h; + h.x = vget_lane_f16(tmp, 0); + return h; +} -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - float64x2_t trn1, trn2; + const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]); + const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]); - // NEON zip performs interleaving of the supplied vectors. 
- // We perform two interleaves in a row to acquire the transposed vector - trn1 = vzip1q_f64(vecs[0], vecs[1]); - trn2 = vzip2q_f64(vecs[0], vecs[1]); + const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0])); + const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1])); - // Do the addition of the resulting vectors - return vaddq_f64(trn1, trn2); + kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]); + kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]); + kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]); + kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]); } -// Other reduction functions: -// mul -#if EIGEN_COMP_CLANG && defined(__apple_build_version__) -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } -#else -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } -#endif -// min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN16 float16x4x4_t tmp_x4; + float16_t* tmp = (float16_t*)&kernel; + tmp_x4 = vld4_f16(tmp); -// max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } - -// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, -// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 -#define PALIGN_NEON(Offset,Type,Command) \ -template<>\ -struct palign_impl\ -{\ - EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ - {\ - if (Offset!=0)\ - first = Command(first, second, Offset);\ - }\ -};\ - -PALIGN_NEON(0,Packet2d,vextq_f64) 
-PALIGN_NEON(1,Packet2d,vextq_f64) -#undef PALIGN_NEON - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); - float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); - - kernel.packet[0] = trn1; - kernel.packet[1] = trn2; -} -#endif // EIGEN_ARCH_ARM64 + kernel.packet[0] = tmp_x4.val[0]; + kernel.packet[1] = tmp_x4.val[1]; + kernel.packet[2] = tmp_x4.val[2]; + kernel.packet[3] = tmp_x4.val[3]; +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + float16x8x2_t T_1[4]; + + T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]); + T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]); + T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]); + T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]); + + float16x8x2_t T_2[4]; + T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]); + T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]); + T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]); + T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]); + + float16x8x2_t T_3[4]; + T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]); + T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]); + T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]); + T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]); + + kernel.packet[0] = T_3[0].val[0]; + kernel.packet[1] = T_3[2].val[0]; + kernel.packet[2] = T_3[1].val[0]; + kernel.packet[3] = T_3[3].val[0]; + kernel.packet[4] = T_3[0].val[1]; + kernel.packet[5] = T_3[2].val[1]; + kernel.packet[6] = T_3[1].val[1]; + kernel.packet[7] = T_3[3].val[1]; +} +#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h new file mode 100644 index 00000000..54f97336 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -0,0 +1,1419 @@ +// This file is part of Eigen, a lightweight C++ template 
library +// for linear algebra. +// +// Copyright (C) 2018 Rasmus Munk Larsen +// Copyright (C) 2020 Antonio Sanchez +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_NEON_H +#define EIGEN_TYPE_CASTING_NEON_H + +namespace Eigen { + +namespace internal { + +//============================================================================== +// pcast, SrcType = float +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4f& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2f& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +// If float64 exists, first convert to that to keep as much precision as possible. +#if EIGEN_ARCH_ARM64 +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + // Discard second half of input. + return vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + // Discard second half of input. + return vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(a))); +} +#else +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4f& a) { + // Discard second half of input. + return vmovl_s32(vget_low_s32(vcvtq_s32_f32(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4f& a) { + // Discard second half of input. 
+ return vmovl_u32(vget_low_u32(vcvtq_u32_f32(a))); +} +#endif // EIGEN_ARCH_ARM64 + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vcvtq_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2f& a) { + return vcvt_s32_f32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { + return vcvtq_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2f& a) { + return vcvt_u32_f32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4f& a, const Packet4f& b) { + return vcombine_s16(vmovn_s32(vcvtq_s32_f32(a)), vmovn_s32(vcvtq_s32_f32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2f& a, const Packet2f& b) { + return vmovn_s32(vcombine_s32(vcvt_s32_f32(a), vcvt_s32_f32(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4f& a, const Packet4f& b) { + return vcombine_u16(vmovn_u32(vcvtq_u32_f32(a)), vmovn_u32(vcvtq_u32_f32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2f& a, const Packet2f& b) { + return vmovn_u32(vcombine_u32(vcvt_u32_f32(a), vcvt_u32_f32(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + const int16x8_t ab_s16 = pcast(a, b); + const int16x8_t cd_s16 = pcast(c, d); + return vcombine_s8(vmovn_s16(ab_s16), 
vmovn_s16(cd_s16)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2f& a, const Packet2f& b, const Packet2f& c, + const Packet2f& d) { + const int16x4_t ab_s16 = pcast(a, b); + const int16x4_t cd_s16 = pcast(c, d); + return vmovn_s16(vcombine_s16(ab_s16, cd_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4f& a, const Packet4f& b, const Packet4f& c, + const Packet4f& d) { + const uint16x8_t ab_u16 = pcast(a, b); + const uint16x8_t cd_u16 = pcast(c, d); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2f& a, const Packet2f& b, const Packet2f& c, + const Packet2f& d) { + const uint16x4_t ab_u16 = pcast(a, b); + const uint16x4_t cd_u16 = pcast(c, d); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +//============================================================================== +// pcast, SrcType = int8_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16c& a) { + // Discard all but first 4 bytes. + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet8c& a) { + // Discard all but first 2 bytes. + return vcvt_f32_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16c& a) { + // Discard all but first two bytes. 
+ return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16c& a) { + // Discard all but first 4 bytes. + return vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet8c& a) { + // Discard all but first 2 bytes. + return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { + return vreinterpretq_u32_s32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { + return vreinterpret_u32_s32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16c& a) { + // Discard second half of input. + return vmovl_s8(vget_low_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet8c& a) { + // Discard second half of input. 
+ return vget_low_s16(vmovl_s8(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { + return vreinterpretq_u16_s16(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { + return vreinterpret_u16_s16(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet16c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet8c& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4c pcast(const Packet4c& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { + return vreinterpretq_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { + return vreinterpret_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { + return static_cast(a); +} + +//============================================================================== +// pcast, SrcType = uint8_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet16uc& a) { + // Discard all but first 4 bytes. + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet8uc& a) { + // Discard all but first 2 bytes. 
+ return vcvt_f32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16uc& a) { + // Discard all but first two bytes. + return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16uc& a) { + // Discard all but first 4 bytes. + return vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8uc& a) { + // Discard all but first 2 bytes. + return vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { + return vreinterpretq_s32_u32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { + return vreinterpret_s32_u32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet16uc& a) { + // Discard second half of input. + return vmovl_u8(vget_low_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet8uc& a) { + // Discard second half of input. 
+ return vget_low_u16(vmovl_u8(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { + return vreinterpretq_s16_u16(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { + return vreinterpret_s16_u16(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8uc& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4uc& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { + return vreinterpretq_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { + return vreinterpret_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { + return static_cast(a); +} + +//============================================================================== +// pcast, SrcType = int16_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8s& a) { + // Discard second half of input. + return vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet4s& a) { + // Discard second half of input. 
+ return vcvt_f32_s32(vget_low_s32(vmovl_s16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8s& a) { + // Discard all but first two values. + return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8s& a) { + // Discard second half of input. + return vmovl_s16(vget_low_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet4s& a) { + // Discard second half of input. + return vget_low_s32(vmovl_s16(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { + return vreinterpretq_u32_s32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { + return vreinterpret_u32_s32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet8s& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet4s& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { + return vreinterpretq_u16_s16(a); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { + return vreinterpret_u16_s16(a); +} + +template <> +struct 
type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8s& a, const Packet8s& b) { + return vcombine_s8(vmovn_s16(a), vmovn_s16(b)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet4s& a, const Packet4s& b) { + return vmovn_s16(vcombine_s16(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8s& a, const Packet8s& b) { + return vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(a)), vmovn_u16(vreinterpretq_u16_s16(b))); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet4s& a, const Packet4s& b) { + return vmovn_u16(vcombine_u16(vreinterpret_u16_s16(a), vreinterpret_u16_s16(b))); +} + +//============================================================================== +// pcast, SrcType = uint16_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet8us& a) { + // Discard second half of input. + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(a))); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet4us& a) { + // Discard second half of input. + return vcvt_f32_u32(vget_low_u32(vmovl_u16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8us& a) { + // Discard all but first two values. 
+ return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(a)))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8us& a) { + // Discard second half of input. + return vmovl_u16(vget_low_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4us& a) { + // Discard second half of input. + return vget_low_u32(vmovl_u16(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { + return vreinterpretq_s32_u32(pcast(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { + return vreinterpret_s32_u32(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet8us& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet4us& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { + return vreinterpretq_s16_u16(a); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { + return vreinterpret_s16_u16(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet8us& a, const Packet8us& b) { + return vcombine_u8(vmovn_u16(a), vmovn_u16(b)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc 
pcast(const Packet4us& a, const Packet4us& b) { + return vmovn_u16(vcombine_u16(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { + return vreinterpretq_s8_u8(pcast(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { + return vreinterpret_s8_u8(pcast(a, b)); +} + +//============================================================================== +// pcast, SrcType = int32_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vcvtq_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2i& a) { + return vcvt_f32_s32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4i& a) { + // Discard second half of input. 
+ return vmovl_s32(vget_low_s32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { + return vreinterpretq_u64_s64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4i& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2i& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { + return vreinterpretq_u32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { + return vreinterpret_u32_s32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4i& a, const Packet4i& b) { + return vcombine_s16(vmovn_s32(a), vmovn_s32(b)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2i& a, const Packet2i& b) { + return vmovn_s32(vcombine_s32(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4i& a, const Packet4i& b) { + return vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(a)), vmovn_u32(vreinterpretq_u32_s32(b))); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2i& a, const Packet2i& b) { + return vmovn_u32(vreinterpretq_u32_s32(vcombine_s32(a, b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) 
{ + const int16x8_t ab_s16 = pcast(a, b); + const int16x8_t cd_s16 = pcast(c, d); + return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2i& a, const Packet2i& b, const Packet2i& c, + const Packet2i& d) { + const int16x4_t ab_s16 = vmovn_s32(vcombine_s32(a, b)); + const int16x4_t cd_s16 = vmovn_s32(vcombine_s32(c, d)); + return vmovn_s16(vcombine_s16(ab_s16, cd_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4i& a, const Packet4i& b, const Packet4i& c, + const Packet4i& d) { + const uint16x8_t ab_u16 = pcast(a, b); + const uint16x8_t cd_u16 = pcast(c, d); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2i& a, const Packet2i& b, const Packet2i& c, + const Packet2i& d) { + const uint16x4_t ab_u16 = pcast(a, b); + const uint16x4_t cd_u16 = pcast(c, d); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +//============================================================================== +// pcast, SrcType = uint32_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { + return vcvtq_f32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ui& a) { + return vcvt_f32_u32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4ui& a) { + // Discard second half of input. 
+ return vmovl_u32(vget_low_u32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { + return vreinterpretq_s64_u64(pcast(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4ui& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2ui& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { + return vreinterpretq_s32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { + return vreinterpret_s32_u32(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet4ui& a, const Packet4ui& b) { + return vcombine_u16(vmovn_u32(a), vmovn_u32(b)); +} +template <> +EIGEN_STRONG_INLINE Packet4us pcast(const Packet2ui& a, const Packet2ui& b) { + return vmovn_u32(vcombine_u32(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { + return vreinterpretq_s16_u16(pcast(a, b)); +} +template <> +EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { + return vreinterpret_s16_u16(pcast(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + const uint16x8_t ab_u16 = 
vcombine_u16(vmovn_u32(a), vmovn_u32(b)); + const uint16x8_t cd_u16 = vcombine_u16(vmovn_u32(c), vmovn_u32(d)); + return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16)); +} +template <> +EIGEN_STRONG_INLINE Packet8uc pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, + const Packet2ui& d) { + const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(a, b)); + const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(c, d)); + return vmovn_u16(vcombine_u16(ab_u16, cd_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, + const Packet4ui& d) { + return vreinterpretq_s8_u8(pcast(a, b, c, d)); +} +template <> +EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, + const Packet2ui& d) { + return vreinterpret_s8_u8(pcast(a, b, c, d)); +} + +//============================================================================== +// pcast, SrcType = int64_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2l& a, const Packet2l& b) { + return vcvtq_f32_s32(vcombine_s32(vmovn_s64(a), vmovn_s64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2l& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { + return vreinterpretq_u64_s64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> 
+EIGEN_STRONG_INLINE Packet4i pcast(const Packet2l& a, const Packet2l& b) { + return vcombine_s32(vmovn_s64(a), vmovn_s64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2l& a, const Packet2l& b) { + return vcombine_u32(vmovn_u64(vreinterpretq_u64_s64(a)), vmovn_u64(vreinterpretq_u64_s64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + const int32x4_t ab_s32 = pcast(a, b); + const int32x4_t cd_s32 = pcast(c, d); + return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d) { + const uint32x4_t ab_u32 = pcast(a, b); + const uint32x4_t cd_u32 = pcast(c, d); + return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + const Packet2l& g, const Packet2l& h) { + const int16x8_t abcd_s16 = pcast(a, b, c, d); + const int16x8_t efgh_s16 = pcast(e, f, g, h); + return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2l& a, const Packet2l& b, const Packet2l& c, + const Packet2l& d, const Packet2l& e, const Packet2l& f, + 
const Packet2l& g, const Packet2l& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +//============================================================================== +// pcast, SrcType = uint64_t +//============================================================================== +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2ul& a, const Packet2ul& b) { + return vcvtq_f32_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2ul& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { + return vreinterpretq_s64_u64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2ul& a, const Packet2ul& b) { + return vcombine_u32(vmovn_u64(a), vmovn_u64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { + return vreinterpretq_s32_u32(pcast(a, b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b))); + const uint16x4_t cd_u16 = 
vmovn_u32(vcombine_u32(vmovn_u64(c), vmovn_u64(d))); + return vcombine_u16(ab_u16, cd_u16); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d) { + return vreinterpretq_s16_u16(pcast(a, b, c, d)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, + const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, + const Packet2ul& g, const Packet2ul& h) { + return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); +} + +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return vreinterpret_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return vreinterpret_f32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return vreinterpretq_f32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return vreinterpretq_f32_u32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4c 
preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return vreinterpret_s8_u8(a); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return vreinterpretq_s8_u8(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return vreinterpret_u8_s8(a); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return vreinterpretq_u8_s8(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return vreinterpret_s16_u16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return vreinterpretq_s16_u16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return vreinterpret_u16_s16(a); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return vreinterpretq_u16_s16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return vreinterpret_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return vreinterpret_s32_u32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return vreinterpretq_s32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return vreinterpretq_s32_u32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return vreinterpret_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return vreinterpret_u32_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return vreinterpretq_u32_f32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return 
vreinterpretq_u32_s32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return vreinterpretq_s64_u64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return vreinterpretq_u64_s64(a); +} + +#if EIGEN_ARCH_ARM64 + +//============================================================================== +// pcast/preinterpret, Double +//============================================================================== + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2d& a) { + return a; +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { + return vcvtq_s64_f64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2d& a) { + return vcvtq_u64_f64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4i pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_s32(vmovn_s64(vcvtq_s64_f64(a)), vmovn_s64(vcvtq_s64_f64(b))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet4ui pcast(const Packet2d& a, const Packet2d& b) { + return vcombine_u32(vmovn_u64(vcvtq_u64_f64(a)), vmovn_u64(vcvtq_u64_f64(b))); +} + +template <> +struct 
type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8s pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + const int32x4_t ab_s32 = pcast(a, b); + const int32x4_t cd_s32 = pcast(c, d); + return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet8us pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d) { + const uint32x4_t ab_u32 = pcast(a, b); + const uint32x4_t cd_u32 = pcast(c, d); + return vcombine_u16(vmovn_u32(ab_u32), vmovn_u32(cd_u32)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16c pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const int16x8_t abcd_s16 = pcast(a, b, c, d); + const int16x8_t efgh_s16 = pcast(e, f, g, h); + return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16uc pcast(const Packet2d& a, const Packet2d& b, const Packet2d& c, + const Packet2d& d, const Packet2d& e, const Packet2d& f, + const Packet2d& g, const Packet2d& h) { + const uint16x8_t abcd_u16 = pcast(a, b, c, d); + const uint16x8_t efgh_u16 = pcast(e, f, g, h); + return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { + // Discard second-half of input. 
+ return vcvt_f64_f32(vget_low_f32(a)); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_s8(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_u8(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_s16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { + // Discard all but first two values. + return vcvt_f64_f32(pcast(vget_low_u16(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4i& a) { + // Discard second half of input. + return vcvtq_f64_s64(vmovl_s32(vget_low_s32(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet4ui& a) { + // Discard second half of input. 
+ return vcvtq_f64_u64(vmovl_u32(vget_low_u32(a))); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { + return vcvtq_f64_s64(a); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; +template <> +EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { + return vcvtq_f64_u64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return vreinterpretq_f64_s64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return vreinterpretq_f64_u64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return vreinterpretq_s64_f64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return vreinterpretq_u64_f64(a); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return vreinterpretq_f64_s32(a); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return vreinterpretq_s32_f64(a); +} + +#endif // EIGEN_ARCH_ARM64 + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_NEON_H diff --git a/externals/eigen/Eigen/src/Core/arch/SSE/Complex.h b/externals/eigen/Eigen/src/Core/arch/SSE/Complex.h index 5607fe0a..8fe22da4 100644 --- a/externals/eigen/Eigen/src/Core/arch/SSE/Complex.h +++ b/externals/eigen/Eigen/src/Core/arch/SSE/Complex.h @@ -19,7 +19,7 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; + Packet4f v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -40,20 +40,33 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, HasMax = 0, 
HasSetLinear = 0, - HasBlend = 1 + HasBlend = 1 }; }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet4f as_real; + enum { + size=2, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } + template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); @@ -82,10 +95,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { 
EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); } @@ -93,19 +107,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { Packet2cf res; -#if EIGEN_GNUC_AT_MOST(4,2) - // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2 - res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast(&from)); -#elif EIGEN_GNUC_AT_LEAST(4,6) - // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6 - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wuninitialized" - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); - #pragma GCC diagnostic pop +#ifdef EIGEN_VECTORIZE_SSE3 + res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); #else - res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); + res.v = _mm_movelh_ps(res.v, res.v); #endif - return Packet2cf(_mm_movelh_ps(res.v,res.v)); + return res; } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } @@ -128,7 +136,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -152,113 +160,26 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); -} - template<> EIGEN_STRONG_INLINE 
std::complex predux_mul(const Packet2cf& a) { return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset==1) - { - first.v = _mm_movehl_ps(first.v, first.v); - first.v = _mm_movelh_ps(first.v, second.v); - } - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = 
_mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a, pconj(b)); __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) -{ - return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); -} //---------- double ---------- @@ -266,7 +187,7 @@ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; + Packet2d v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -287,6 +208,7 @@ template<> struct packet_traits > : default_packet_traits 
HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -296,7 +218,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d as_real; + enum { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } @@ -321,10 +254,11 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } +template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); } // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) @@ -340,7 +274,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) 
{ EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { @@ -356,102 +290,17 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack return pfirst(a); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) -{ - return vecs[0]; -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
- } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE 
Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } @@ -471,33 +320,32 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { - __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); - return Packet2cf(_mm_castpd_ps(result)); +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +{ + __m128 eq = _mm_cmpeq_ps(a.v, b.v); + return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); } -template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { - return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); + __m128d eq = _mm_cmpeq_pd(a.v, b.v); + return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); } -template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) -{ - return pset1(b); +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, 
_mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); } -template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) -{ - return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); } -template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) -{ - return pset1(b); +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); } } // end namespace internal - } // end namespace Eigen #endif // EIGEN_COMPLEX_SSE_H diff --git a/externals/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h index 7b5f948e..8736d0d6 100644 --- a/externals/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,7 +8,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-/* The sin, cos, exp, and log functions of this file come from +/* The sin and cos and functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -20,426 +20,57 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - - /* the smallest non denormalized float number */ - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - - - Packet4i emm0; - - Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN - Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); - - x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); - - /* 
keep only the fractional part */ - x = _mm_and_ps(x, p4f_inv_mant_mask); - x = _mm_or_ps(x, p4f_half); - - emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = pand(x, mask); - x = psub(x, p4f_1); - e = psub(e, pand(p4f_1, mask)); - x = padd(x, tmp); - - Packet4f x2 = pmul(x,x); - Packet4f x3 = pmul(x2,x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y , x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p4f_cephes_log_q1); - tmp = pmul(x2, p4f_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4f_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), - _mm_and_ps(iszero_mask, p4f_minus_inf)); +Packet4f plog(const Packet4f& _x) { + return plog_float(_x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) -{ - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - - _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); - _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); - 
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); - - Packet4f tmp, fx; - Packet4i emm0; +Packet2d plog(const Packet2d& _x) { + return plog_double(_x); +} - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog2(const Packet4f& _x) { + return plog2_float(_x); +} - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d plog2(const Packet2d& _x) { + return plog2_double(_x); +} -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_ps(fx); -#else - emm0 = _mm_cvttps_epi32(fx); - tmp = _mm_cvtepi32_ps(emm0); - /* if greater, substract 1 */ - Packet4f mask = _mm_cmpgt_ps(tmp, fx); - mask = _mm_and_ps(mask, p4f_1); - fx = psub(tmp, mask); -#endif +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog1p(const Packet4f& _x) { + return generic_plog1p(_x); +} - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = _mm_cvttps_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_0x7f); - emm0 = _mm_slli_epi32(emm0, 23); - return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexpm1(const Packet4f& _x) { + return 
generic_expm1(_x); } + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp(const Packet2d& _x) +Packet4f pexp(const Packet4f& _x) { - Packet2d x = _x; - - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); - - Packet2d tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); - /* express exp(x) as exp(g + n*log(2)) */ - fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); - -#ifdef EIGEN_VECTORIZE_SSE4_1 - fx = _mm_floor_pd(fx); -#else - emm0 = _mm_cvttpd_epi32(fx); - tmp = _mm_cvtepi32_pd(emm0); - /* if greater, substract 1 */ - Packet2d mask = _mm_cmpgt_pd(tmp, fx); - mask = _mm_and_pd(mask, p2d_1); - fx = psub(tmp, mask); -#endif - - tmp = pmul(fx, p2d_cephes_exp_C1); - Packet2d z = pmul(fx, p2d_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet2d x2 = pmul(x,x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul (px, x); - - 
Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px,psub(qx,px)); - x = pmadd(p2d_2,x,p2d_1); - - // build 2^n - emm0 = _mm_cvttpd_epi32(fx); - emm0 = _mm_add_epi32(emm0, p4i_1023_0); - emm0 = _mm_slli_epi32(emm0, 20); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); + return pexp_float(_x); } -/* evaluation of 4 sines at onces, using SSE2 intrinsics. - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& x) +{ + return pexp_double(x); +} template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 
2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, sign_bit, y; - - Packet4i emm0, emm2; - sign_bit = x; - /* take the absolute value */ - x = pabs(x); - - /* take the modulo */ - - /* extract the sign bit (upper one) */ - sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* store the integer part of y in mm0 */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - /* get the swap sign flag */ - emm0 = _mm_and_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& _x) { - Packet4f x = _x; - _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); - _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - - _EIGEN_DECLARE_CONST_Packet4i(1, 1); - _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); - _EIGEN_DECLARE_CONST_Packet4i(2, 2); - _EIGEN_DECLARE_CONST_Packet4i(4, 4); - - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); - _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); - _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); - 
_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI - - Packet4f xmm1, xmm2, xmm3, y; - Packet4i emm0, emm2; - - x = pabs(x); - - /* scale by 4/Pi */ - y = pmul(x, p4f_cephes_FOPI); - - /* get the integer part of y */ - emm2 = _mm_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, p4i_1); - emm2 = _mm_and_si128(emm2, p4i_not1); - y = _mm_cvtepi32_ps(emm2); - - emm2 = _mm_sub_epi32(emm2, p4i_2); - - /* get the swap sign flag */ - emm0 = _mm_andnot_si128(emm2, p4i_4); - emm0 = _mm_slli_epi32(emm0, 29); - /* get the polynom selection mask */ - emm2 = _mm_and_si128(emm2, p4i_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - - Packet4f sign_bit = _mm_castsi128_ps(emm0); - Packet4f poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" - x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = pmul(y, p4f_minus_cephes_DP1); - xmm2 = pmul(y, p4f_minus_cephes_DP2); - xmm3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, xmm1); - x = padd(x, xmm2); - x = padd(x, xmm3); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - y = p4f_coscof_p0; - Packet4f z = pmul(x,x); - - y = pmadd(y,z,p4f_coscof_p1); - y = pmadd(y,z,p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - Packet4f tmp = _mm_mul_ps(z, p4f_half); - y = psub(y, tmp); - y = padd(y, p4f_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - /* select the correct result from the two polynoms */ - y2 = _mm_and_ps(poly_mask, y2); - y = _mm_andnot_ps(poly_mask, y); - y = _mm_or_ps(y,y2); - - /* update the sign */ - return _mm_xor_ps(y, sign_bit); + return pcos_float(_x); } #if EIGEN_FAST_MATH @@ -455,17 +86,17 @@ Packet4f pcos(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& _x) 
{ - Packet4f half = pmul(_x, pset1(.5f)); - Packet4f denormal_mask = _mm_and_ps( - _mm_cmpge_ps(_x, _mm_setzero_ps()), - _mm_cmplt_ps(_x, pset1((std::numeric_limits::min)()))); + Packet4f minus_half_x = pmul(_x, pset1(-0.5f)); + Packet4f denormal_mask = pandnot( + pcmp_lt(_x, pset1((std::numeric_limits::min)())), + pcmp_lt(_x, pzero(_x))); // Compute approximate reciprocal sqrt. Packet4f x = _mm_rsqrt_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1(1.5f))); // Flush results for denormals to zero. - return _mm_andnot_ps(denormal_mask, pmul(_x,x)); + return pandnot(pmul(_x,x), denormal_mask); } #else @@ -478,41 +109,48 @@ Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet16b psqrt(const Packet16b& x) { return x; } + #if EIGEN_FAST_MATH template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); Packet4f neg_half = pmul(_x, p4f_minus_half); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); - Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); - - // Fill in NaNs and Infs for the negative/zero entries. 
- Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); - Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); - Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), - _mm_and_ps(zero_mask, p4f_inf)); - - // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); - - // Insert NaNs and Infs in all the right places. - return _mm_or_ps(x, infs_and_nans); + // Identity infinite, zero, negative and denormal arguments. + Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); + Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); + Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); + + // Compute an approximate result using the rsqrt intrinsic. + Packet4f y_approx = _mm_rsqrt_ps(_x); + + // Do a single step of Newton-Raphson iteration to improve the approximation. + // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). + // It is essential to evaluate the inner term like this because forming + // y_n^2 may over- or underflow. + Packet4f y_newton = pmul( + y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); + + // Select the result of the Newton-Raphson step for positive normal arguments. + // For other arguments, choose the output of the intrinsic. This will + // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if + // x is zero or a positive denormalized float (equivalent to flushing positive + // denormalized inputs to zero). + return pselect(not_normal_finite_mask, y_approx, y_newton); } #else template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { - // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. + // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation. 
return _mm_div_ps(pset1(1.0f), _mm_sqrt_ps(x)); } @@ -520,7 +158,6 @@ Packet4f prsqrt(const Packet4f& x) { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); } @@ -548,7 +185,7 @@ double sqrt(const double &x) { #if EIGEN_COMP_GNUC_STRICT // This works around a GCC bug generating poor code for _mm_sqrt_pd - // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b + // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970 return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); diff --git a/externals/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/SSE/PacketMath.h index 3832de14..db102c73 100644 --- a/externals/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/externals/eigen/Eigen/src/Core/arch/SSE/PacketMath.h @@ -18,63 +18,93 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) +// 32 bits => 8 registers +// 64 bits => 16 registers #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif #endif -#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004) +#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // 
have overloads for both types without linking error. // One solution is to increase ABI version using -fabi-version=4 (or greater). // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper // structure: -template -struct eigen_packet_wrapper -{ - EIGEN_ALWAYS_INLINE operator T&() { return m_val; } - EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; typedef eigen_packet_wrapper<__m128> Packet4f; -typedef eigen_packet_wrapper<__m128i> Packet4i; typedef eigen_packet_wrapper<__m128d> Packet2d; #else typedef __m128 Packet4f; -typedef __m128i Packet4i; typedef __m128d Packet2d; #endif +typedef eigen_packet_wrapper<__m128i, 0> Packet4i; +typedef eigen_packet_wrapper<__m128i, 1> Packet16b; + template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; template<> struct is_arithmetic<__m128d> { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template +struct shuffle_mask{ + enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) }; +}; +// TODO: change the implementation of all swizzle* ops from macro to template, #define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) + Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask::mask)))) #define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) + Packet4i(_mm_shuffle_epi32( v, (shuffle_mask::mask))) #define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - + 
Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask)))) + #define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) + Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask::mask))) #define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) + Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask::mask))))) + +EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_movelh_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_movehl_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_unpacklo_ps(a,b)); +} +EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) +{ + return Packet4f(_mm_unpackhi_ps(a,b)); +} +#define vec4f_duplane(a,p) \ + vec4f_swizzle2(a,a,p,p,p,p) + +#define vec2d_swizzle2(a,b,mask) \ + Packet2d(_mm_shuffle_pd(a,b,mask)) + +EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) +{ + return Packet2d(_mm_unpacklo_pd(a,b)); +} +EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) +{ + return Packet2d(_mm_unpackhi_pd(a,b)); +} +#define vec2d_duplane(a,p) \ + vec2d_swizzle2(a,a,(p<<1)|p) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -83,7 +113,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet2d p2d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) + const Packet4f p4f_##NAME = pset1frombits(X) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -92,36 +122,41 @@ template<> struct 
is_arithmetic<__m128d> { enum { value = true }; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going // to leverage AVX instructions. #ifndef EIGEN_VECTORIZE_AVX -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 0, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, + HasCmp = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasNdtri = 1, + HasExp = 1, + HasBessel = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasBlend = 1 - + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasBlend = 1, + HasCeil = 1, + HasFloor = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , HasRound = 1, - HasFloor = 1, - HasCeil = 1 #endif + HasRint = 1 }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet2d type; typedef Packet2d half; enum { @@ -130,18 +165,19 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 0, + HasCmp = 1, HasDiv = 1, + HasLog = 1, HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1 - + HasBlend = 1, + HasFloor = 1, + HasCeil = 1, #ifdef EIGEN_VECTORIZE_SSE4_1 - , HasRound = 1, - HasFloor = 1, - HasCeil = 1 #endif + HasRint = 1 }; }; #endif @@ -154,13 +190,56 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=4, + HasShift = 1, HasBlend = 1 }; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, 
alignment=Aligned16}; typedef Packet4i half; }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet16b type; + typedef Packet16b half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + HasHalfPacket = 0, + size=16, + + HasAdd = 1, + HasSub = 1, + HasShift = 0, + HasMul = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSqrt = 1 + }; +}; + +template<> struct unpacket_traits { + typedef float type; + typedef Packet4f half; + typedef Packet4i integer_packet; + enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef double type; + typedef Packet2d half; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef int type; + typedef Packet4i half; + enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false}; +}; +template<> struct unpacket_traits { + typedef bool type; + typedef Packet16b half; + enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; +}; #ifndef EIGEN_VECTORIZE_AVX template<> struct scalar_div_cost { enum { value = 7 }; }; @@ -179,6 +258,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif +template<> EIGEN_STRONG_INLINE Packet16b pset1(const bool& from) { return _mm_set1_epi8(static_cast(from)); } + +template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } +template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); } + 
+template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); } +template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); } +template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); } + +template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } +template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } +template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) @@ -190,7 +281,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); } #endif - + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } @@ -199,9 +290,34 @@ template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b padd(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const 
Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b psub(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); +template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_ps(a,b); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0)); + return padd(a, pxor(mask, b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); +template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + return _mm_addsub_pd(a,b); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); + return padd(a, pxor(mask, b)); +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { @@ -218,6 +334,11 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } +template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a) +{ + return psub(pset1(false), a); +} + template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -240,18 +361,126 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const #endif } +template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const 
Packet2d& b) { return _mm_div_pd(a,b); } // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { + return _mm_blendv_ps(b,a,mask); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) { + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask))); +} + +template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); } + +template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) { + return _mm_blendv_epi8(b,a,mask); +} +#else +template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) { + Packet16b a_part = _mm_and_si128(mask, a); + Packet16b b_part = _mm_andnot_si128(mask, b); + return _mm_or_si128(a_part, b_part); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } +template<> EIGEN_STRONG_INLINE Packet16b ptrue(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); } 
+template<> EIGEN_STRONG_INLINE Packet4f +ptrue(const Packet4f& a) { + Packet4i b = _mm_castps_si128(a); + return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); +} +template<> EIGEN_STRONG_INLINE Packet2d +ptrue(const Packet2d& a) { + Packet4i b = _mm_castpd_si128(a); + return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); +} + + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pand(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b por(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pxor(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const 
Packet4i& b) { return _mm_andnot_si128(b,a); } + +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_ps, so we have to + // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet4f res = b; + asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet2d res = b; + asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -263,8 +492,45 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_ps, so we have to + // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet4f res; + asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet4f res = b; + asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + #ifdef EIGEN_VECTORIZE_AVX + Packet2d res; + asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); + #else + Packet2d res = b; + asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + #endif + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -276,36 +542,180 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } +template +EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) { + // In this implementation, we take advantage of the fact that pmin/pmax for SSE + // always return a if either a or b is NaN. 
+ Packet not_nan_mask_a = pcmp_eq(a, a); + Packet m = op(a, b); + return pselect(not_nan_mask_a, m, b); +} + +template +EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) { + // In this implementation, we take advantage of the fact that pmin/pmax for SSE + // always return a if either a or b is NaN. + Packet not_nan_mask_a = pcmp_eq(a, a); + Packet m = op(b, a); + return pselect(not_nan_mask_a, m, a); +} + +// Add specializations for min/max with prescribed NaN progation. +template<> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_numbers(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_numbers(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_nan(a, b, pmin); +} +template<> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + return pminmax_propagate_nan(a, b, pmax); +} +template<> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + return pminmax_propagate_nan(a, b, pmax); +} + +template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_right (const Packet4i& a) { return _mm_srli_epi32(a,N); } +template EIGEN_STRONG_INLINE Packet4i plogical_shift_left (const Packet4i& a) { return _mm_slli_epi32(a,N); } + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const 
Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm_and_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) +{ + #ifdef EIGEN_VECTORIZE_SSSE3 + return _mm_abs_epi32(a); + #else + Packet4i aux = _mm_srai_epi32(a,31); + return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); + #endif +} + #ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round. + const Packet4f mask = pset1frombits(0x80000000u); + const Packet4f prev0dot5 = pset1frombits(0x3EFFFFFFu); + return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull)); + const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); + return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); +} + +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } template<> EIGEN_STRONG_INLINE 
Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } -#endif +#else +template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const Packet4f limit = pset1(static_cast(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. + const Packet2d limit = pset1(static_cast(1ull<<52)); + const Packet2d abs_a = pabs(a); + Packet2d r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. 
+ r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If greater, subtract one. + Packet4f mask = _mm_cmpgt_ps(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet2d tmp = print(a); + // If greater, subtract one. + Packet2d mask = _mm_cmpgt_pd(tmp, a); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + const Packet4f cst_1 = pset1(1.0f); + Packet4f tmp = print(a); + // If smaller, add one. 
+ Packet4f mask = _mm_cmplt_ps(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} + +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) +{ + const Packet2d cst_1 = pset1(1.0); + Packet2d tmp = print(a); + // If smaller, add one. + Packet2d mask = _mm_cmplt_pd(tmp, a); + mask = pand(mask, cst_1); + return padd(tmp, mask); +} +#endif template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet16b pload(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -340,6 +750,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet16b ploadu(const bool* from) { + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_si128(reinterpret_cast(from)); +} template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) @@ -355,13 +769,32 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec4i_swizzle1(tmp, 0, 0, 1, 1); } +// Loads 8 bools from memory and returns the packet +// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7} +template<> EIGEN_STRONG_INLINE Packet16b ploaddup(const bool* from) +{ + __m128i tmp = _mm_castpd_si128(pload1(reinterpret_cast(from))); + return _mm_unpacklo_epi8(tmp, tmp); +} + +// Loads 4 bools from memory and returns the packet +// {b0, b0 b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3} +template<> EIGEN_STRONG_INLINE Packet16b +ploadquad(const bool* from) { + __m128i tmp = 
_mm_castps_si128(pload1(reinterpret_cast(from))); + tmp = _mm_unpacklo_epi8(tmp, tmp); + return _mm_unpacklo_epi16(tmp, tmp); +} + template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstore(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -374,7 +807,15 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); - } +} + +template<> EIGEN_DEVICE_FUNC inline Packet16b pgather(const bool* from, Index stride) +{ + return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride], + from[11*stride], from[10*stride], from[9*stride], from[8*stride], + from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], 
from[2*stride], from[1*stride], from[0*stride]); +} template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { @@ -395,6 +836,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } +template<> EIGEN_DEVICE_FUNC inline void pscatter(bool* to, const Packet16b& from, Index stride) +{ + to[4*stride*0] = _mm_cvtsi128_si32(from); + to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} + // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) @@ -409,10 +858,16 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 +typedef const void * SsePrefetchPtrType; +#else +typedef const char * SsePrefetchPtrType; +#endif + #ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 @@ -431,32 +886,62 @@ template<> 
EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { retu template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } #endif +template<> EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast(x & 1); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ return _mm_shuffle_ps(a,a,0x1B); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ return _mm_shuffle_pd(a,a,0x1); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) { +#ifdef EIGEN_VECTORIZE_SSSE3 + __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm_shuffle_epi8(a, mask); +#else + Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8)); +#endif +} -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); - return _mm_and_ps(a,mask); +template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { + return pfrexp_generic(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); 
- return _mm_and_pd(a,mask); + +// Extract exponent without existence of Packet2l. +template<> +EIGEN_STRONG_INLINE +Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) { + const Packet2d cst_exp_mask = pset1frombits(static_cast(0x7ff0000000000000ull)); + __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52); + return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) -{ - #ifdef EIGEN_VECTORIZE_SSSE3 - return _mm_abs_epi32(a); - #else - Packet4i aux = _mm_srai_epi32(a,31); - return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); - #endif + +template<> EIGEN_STRONG_INLINE Packet2d pfrexp(const Packet2d& a, Packet2d& exponent) { + return pfrexp_generic(a, exponent); +} + +template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { + return pldexp_generic(a,exponent); +} + +// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well +// supported by SSE, and has more range than is needed for exponents. +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent); + + // Convert e to integer and swizzle to low-order bits. 
+ const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3); + + // Split 2^e into four factors and multiply: + const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023); + Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4) + Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(ei, b), b), b); // e - 3b + c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; } // with AVX, the default implementations based on pload1 are faster @@ -499,38 +984,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); } -#ifdef EIGEN_VECTORIZE_SSE3 -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - return _mm_hadd_pd(vecs[0], vecs[1]); -} - -#else -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - Packet4f tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); -} -#endif // SSE3 - template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures @@ -556,38 +1009,28 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) } #ifdef EIGEN_VECTORIZE_SSSE3 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp0 = _mm_hadd_epi32(a,a); return pfirst(_mm_hadd_epi32(tmp0,tmp0)); } + #else template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } +#endif -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - Packet4i tmp0, tmp1, tmp2; - tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - tmp0 = _mm_add_epi32(tmp0, tmp1); - tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - tmp1 = _mm_add_epi32(tmp1, tmp2); - tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); - tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); - return _mm_add_epi32(tmp0, tmp2); +template<> EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { + Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a)); + return (pfirst(tmp) != 0) || (pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); } -#endif + // Other reduction functions: + // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { @@ -605,7 +1048,13 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) // TODO try to call _mm_mul_epu32 directly EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]);; + return (aux[0] * aux[1]) * (aux[2] * aux[3]); +} + +template<> EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) { + Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a)); + return ((pfirst(tmp) == 0x01010101) && + (pfirst(_mm_shuffle_epi32(tmp, 1)) == 0x01010101)); } // min @@ -660,113 +1109,16 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } -#if EIGEN_COMP_GNUC -// template <> EIGEN_STRONG_INLINE 
Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +// not needed yet +// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) // { -// Packet4f res = b; -// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); -// return res; +// return _mm_movemask_ps(x) == 0xF; // } -// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) -// { -// Packet4i res = a; -// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); -// return res; -// } -#endif - -#ifdef EIGEN_VECTORIZE_SSSE3 -// SSSE3 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset!=0) - first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset!=0) - first = _mm_alignr_epi8(second,first, Offset*4); - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); - } -}; -#else -// SSE2 versions -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - if (Offset==1) - { - first = _mm_move_ss(first,second); - first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); - } - else if (Offset==2) - { - first = _mm_movehl_ps(first,first); - first = _mm_movelh_ps(first,second); - } - else if (Offset==3) - { - first = _mm_move_ss(first,second); - first = _mm_shuffle_ps(first,second,0x93); - } - } -}; - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - if (Offset==1) - { - first = 
_mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_shuffle_epi32(first,0x39); - } - else if (Offset==2) - { - first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); - first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - } - else if (Offset==3) - { - first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); - first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); - } - } -}; -template -struct palign_impl +template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) { - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset==1) - { - first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); - first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); - } - } -}; -#endif + return _mm_movemask_ps(x) != 0x0; +} EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -793,6 +1145,100 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); + __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); + __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = _mm_unpacklo_epi16(T0, T2); + kernel.packet[1] = _mm_unpackhi_epi16(T0, T2); + kernel.packet[2] = _mm_unpacklo_epi16(T1, T3); + kernel.packet[3] = _mm_unpackhi_epi16(T1, T3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + // If we number the elements in the input thus: + // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f} + // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 
1d, 1e, 1f} + // ... + // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff}, + // + // the desired output is: + // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0} + // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1} + // ... + // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff}, + __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f + __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37 + __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f + __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57 + __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a + __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]); + __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]); + __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]); + __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]); + __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]); + __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]); + __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]); + __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]); + __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]); + __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]); + + __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34 + __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ... 
+ __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ... + __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ... + __m128i s6 = _mm_unpacklo_epi16(t5, t7); + __m128i s7 = _mm_unpackhi_epi16(t5, t7); + __m128i s8 = _mm_unpacklo_epi16(t8, ta); + __m128i s9 = _mm_unpackhi_epi16(t8, ta); + __m128i sa = _mm_unpacklo_epi16(t9, tb); + __m128i sb = _mm_unpackhi_epi16(t9, tb); + __m128i sc = _mm_unpacklo_epi16(tc, te); + __m128i sd = _mm_unpackhi_epi16(tc, te); + __m128i se = _mm_unpacklo_epi16(td, tf); + __m128i sf = _mm_unpackhi_epi16(td, tf); + + __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + __m128i u2 = _mm_unpacklo_epi32(s1, s5); + __m128i u3 = _mm_unpackhi_epi32(s1, s5); + __m128i u4 = _mm_unpacklo_epi32(s2, s6); + __m128i u5 = _mm_unpackhi_epi32(s2, s6); + __m128i u6 = _mm_unpacklo_epi32(s3, s7); + __m128i u7 = _mm_unpackhi_epi32(s3, s7); + __m128i u8 = _mm_unpacklo_epi32(s8, sc); + __m128i u9 = _mm_unpackhi_epi32(s8, sc); + __m128i ua = _mm_unpacklo_epi32(s9, sd); + __m128i ub = _mm_unpackhi_epi32(s9, sd); + __m128i uc = _mm_unpacklo_epi32(sa, se); + __m128i ud = _mm_unpackhi_epi32(sa, se); + __m128i ue = _mm_unpacklo_epi32(sb, sf); + __m128i uf = _mm_unpackhi_epi32(sb, sf); + + kernel.packet[0] = _mm_unpacklo_epi64(u0, u8); + kernel.packet[1] = _mm_unpackhi_epi64(u0, u8); + kernel.packet[2] = _mm_unpacklo_epi64(u1, u9); + kernel.packet[3] = _mm_unpackhi_epi64(u1, u9); + kernel.packet[4] = _mm_unpacklo_epi64(u2, ua); + kernel.packet[5] = _mm_unpackhi_epi64(u2, ua); + kernel.packet[6] = _mm_unpacklo_epi64(u3, ub); + kernel.packet[7] = _mm_unpackhi_epi64(u3, ub); + kernel.packet[8] = _mm_unpacklo_epi64(u4, uc); + kernel.packet[9] = _mm_unpackhi_epi64(u4, uc); + kernel.packet[10] = _mm_unpacklo_epi64(u5, ud); + 
kernel.packet[11] = _mm_unpackhi_epi64(u5, ud); + kernel.packet[12] = _mm_unpacklo_epi64(u6, ue); + kernel.packet[13] = _mm_unpackhi_epi64(u6, ue); + kernel.packet[14] = _mm_unpacklo_epi64(u7, uf); + kernel.packet[15] = _mm_unpackhi_epi64(u7, uf); +} + template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { const __m128i zero = _mm_setzero_si128(); const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); @@ -824,56 +1270,236 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons #endif } -template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),1); -#else - return _mm_move_ss(a, _mm_load_ss(&b)); +// Scalar path for pmadd with FMA to ensure consistency with vectorized path. +#ifdef EIGEN_VECTORIZE_FMA +template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { + return ::fmaf(a,b,c); +} +template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { + return ::fma(a,b,c); +} #endif + + +// Packet math for Eigen::half +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. 
+ typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; } -template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_pd(a,pset1(b),1); -#else - return _mm_move_sd(a, _mm_load_sd(&b)); -#endif +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); } -template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) -{ -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blend_ps(a,pset1(b),(1<<3)); -#else - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); - return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); -#endif +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = 
half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha - hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha - hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); 
+ h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; } -template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) +template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha / hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha / hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return 
_mm_blend_pd(a,pset1(b),(1<<1)); -#else - const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); - return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); -#endif + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; } -// Scalar path for pmadd with FMA to ensure consistency with vectorized path. -#ifdef __FMA__ -template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { - return ::fmaf(a,b,c); +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); } -template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { - return ::fma(a,b,c); + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); } + #endif + } // end namespace internal } // end namespace Eigen +#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 +// PGI++ does not define the following intrinsics in C++ mode. 
+static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } +static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } +static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } +static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } +#endif + #endif // EIGEN_PACKET_MATH_SSE_H diff --git a/externals/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h index c8489323..d2a0037e 100644 --- a/externals/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/externals/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -14,6 +14,7 @@ namespace Eigen { namespace internal { +#ifndef EIGEN_VECTORIZE_AVX template <> struct type_casting_traits { enum { @@ -23,11 +24,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - - template <> struct type_casting_traits { enum { @@ -37,11 +33,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - - template <> struct type_casting_traits { enum { @@ -51,10 +42,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - template <> struct type_casting_traits { enum { @@ -63,12 +50,90 @@ struct type_casting_traits { TgtCoeffRatio = 2 }; }; +#endif + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + +template<> 
EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { // Simply discard the second half of the input return _mm_cvtps_pd(a); } +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return _mm_castps_si128(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return _mm_castsi128_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return _mm_castsi128_pd(a); +} + +template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return _mm_castpd_si128(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#if 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif } // end namespace internal diff --git 
a/externals/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h new file mode 100644 index 00000000..b139ea2e --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SVE/MathFunctions.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_SVE_H +#define EIGEN_MATH_FUNCTIONS_SVE_H + +namespace Eigen { +namespace internal { + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pexp(const PacketXf& x) { + return pexp_float(x); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf plog(const PacketXf& x) { + return plog_float(x); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf psin(const PacketXf& x) { + return psin_float(x); +} + +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf pcos(const PacketXf& x) { + return pcos_float(x); +} + +// Hyperbolic Tangent function. +template <> +EIGEN_STRONG_INLINE EIGEN_UNUSED PacketXf ptanh(const PacketXf& x) { + return internal::generic_fast_tanh_float(x); +} +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_SVE_H diff --git a/externals/eigen/Eigen/src/Core/arch/SVE/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/SVE/PacketMath.h new file mode 100644 index 00000000..9060b372 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SVE/PacketMath.h @@ -0,0 +1,752 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_SVE_H +#define EIGEN_PACKET_MATH_SVE_H + +namespace Eigen +{ +namespace internal +{ +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 + +template +struct sve_packet_size_selector { + enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) }; +}; + +/********************************* int32 **************************************/ +typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXi type; + typedef PacketXi half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = sve_packet_size_selector::size, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 // Not implemented in SVE + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; // Half not yet implemented + enum { + size = sve_packet_size_selector::size, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) +{ + svprfw(svptrue_b32(), addr, SV_PLDL1KEEP); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pset1(const numext::int32_t& from) +{ + return svdup_n_s32(from); +} + +template <> +EIGEN_STRONG_INLINE PacketXi plset(const numext::int32_t& a) +{ + numext::int32_t c[packet_traits::size]; + for (int i = 0; i < packet_traits::size; i++) c[i] = i; + 
return svadd_s32_z(svptrue_b32(), pset1(a), svld1_s32(svptrue_b32(), c)); +} + +template <> +EIGEN_STRONG_INLINE PacketXi padd(const PacketXi& a, const PacketXi& b) +{ + return svadd_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi psub(const PacketXi& a, const PacketXi& b) +{ + return svsub_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) +{ + return svneg_s32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) +{ + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmul(const PacketXi& a, const PacketXi& b) +{ + return svmul_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pdiv(const PacketXi& a, const PacketXi& b) +{ + return svdiv_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) +{ + return svmla_s32_z(svptrue_b32(), c, a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmin(const PacketXi& a, const PacketXi& b) +{ + return svmin_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b) +{ + return svmax_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) +{ + return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_lt(const PacketXi& a, const PacketXi& b) +{ + return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_eq(const PacketXi& a, const PacketXi& b) +{ + return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ptrue(const PacketXi& /*a*/) +{ + return svdup_n_s32_z(svptrue_b32(), 0xffffffffu); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) +{ + return 
svdup_n_s32_z(svptrue_b32(), 0); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pand(const PacketXi& a, const PacketXi& b) +{ + return svand_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi por(const PacketXi& a, const PacketXi& b) +{ + return svorr_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& a, const PacketXi& b) +{ + return sveor_s32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) +{ + return svbic_s32_z(svptrue_b32(), a, b); +} + +template +EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) +{ + return svasrd_n_s32_z(svptrue_b32(), a, N); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) +{ + return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N))); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) +{ + return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N)); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pload(const numext::int32_t* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + return svld1_gather_u32index_s32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const numext::int32_t* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + indices 
= svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...} + return svld1_gather_u32index_s32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketXi& from) +{ + EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketXi& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXi pgather(const numext::int32_t* from, Index stride) +{ + // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + return svld1_gather_s32index_s32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketXi& from, Index stride) +{ + // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) +{ + // svlasta returns the first element if all predicate bits are 0 + return svlasta_s32(svpfalse_b(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) +{ + return svrev_s32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) +{ + return svabs_s32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) +{ + return static_cast(svaddv_s32(svptrue_b32(), a)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) +{ + EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT); + + // Multiply the vector by its reverse + svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a)); + svint32_t half_prod; + + // Extract the high half of the vector. 
Depending on the VL more reductions need to be done + if (EIGEN_ARM64_SVE_VL >= 2048) { + half_prod = svtbl_s32(prod, svindex_u32(32, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 1024) { + half_prod = svtbl_s32(prod, svindex_u32(16, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 512) { + half_prod = svtbl_s32(prod, svindex_u32(8, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 256) { + half_prod = svtbl_s32(prod, svindex_u32(4, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + } + // Last reduction + half_prod = svtbl_s32(prod, svindex_u32(2, 1)); + prod = svmul_s32_z(svptrue_b32(), prod, half_prod); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) +{ + return svminv_s32(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) +{ + return svmaxv_s32(svptrue_b32(), a); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + int buffer[packet_traits::size * N] = {0}; + int i = 0; + + PacketXi stride_index = svindex_s32(0, N); + + for (i = 0; i < N; i++) { + svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits::size); + } +} + +/********************************* float32 ************************************/ + +typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = sve_packet_size_selector::size, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + 
HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, // Not implemented in SVE + + HasDiv = 1, + HasFloor = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 0, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; // Half not yet implemented + typedef PacketXi integer_packet; + + enum { + size = sve_packet_size_selector::size, + alignment = Aligned64, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf pset1(const float& from) +{ + return svdup_n_f32(from); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) +{ + float c[packet_traits::size]; + for (int i = 0; i < packet_traits::size; i++) c[i] = i; + return svadd_f32_z(svptrue_b32(), pset1(a), svld1_f32(svptrue_b32(), c)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) +{ + return svadd_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) +{ + return svsub_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) +{ + return svneg_f32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) +{ + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) +{ + return svmul_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) +{ + return svdiv_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, 
const PacketXf& b, const PacketXf& c) +{ + return svmla_f32_z(svptrue_b32(), c, a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) +{ + return svmin_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) +{ + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) +{ + return svminnm_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) +{ + return svmax_f32_z(svptrue_b32(), a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) +{ + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) +{ + return svmaxnm_f32_z(svptrue_b32(), a, b); +} + +// Float comparisons in SVE return svbool (predicate). Use svdup to set active +// lanes to 1 (0xffffffffu) and inactive lanes to 0. +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu)); /* lane mask: all ones where a <= b, zero elsewhere */ +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu)); +} + +// Do a predicate inverse (svnot_b_z) on the predicate resulted from the +// greater/equal comparison (svcmpge_f32). Then fill a float vector with the +// active elements.
+template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) +{ + return svrintm_f32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) +{ + return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu)); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) +{ + return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b))); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pload(const float* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + return svld1_gather_u32index_f32(svptrue_b32(), from, 
indices); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) +{ + svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...} + indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...} + return svld1_gather_u32index_f32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketXf& from) +{ + EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketXf& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXf pgather(const float* from, Index stride) +{ + // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + return svld1_gather_s32index_f32(svptrue_b32(), from, indices); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketXf& from, Index stride) +{ + // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...} + svint32_t indices = svindex_s32(0, stride); + svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketXf& a) +{ + // svlasta returns the first element if all predicate bits are 0 + return svlasta_f32(svpfalse_b(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) +{ + return svrev_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) +{ + return svabs_f32_z(svptrue_b32(), a); +} + +// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for +// all vector extensions and the generic version. 
+template <> +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) +{ + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) +{ + return svaddv_f32(svptrue_b32(), a); +} + +// Other reduction functions: +// mul +// Only works for SVE Vls multiple of 128 +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) +{ + EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT); + // Multiply the vector by its reverse + svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a)); + svfloat32_t half_prod; + + // Extract the high half of the vector. Depending on the VL more reductions need to be done + if (EIGEN_ARM64_SVE_VL >= 2048) { + half_prod = svtbl_f32(prod, svindex_u32(32, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 1024) { + half_prod = svtbl_f32(prod, svindex_u32(16, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 512) { + half_prod = svtbl_f32(prod, svindex_u32(8, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + if (EIGEN_ARM64_SVE_VL >= 256) { + half_prod = svtbl_f32(prod, svindex_u32(4, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + } + // Last reduction + half_prod = svtbl_f32(prod, svindex_u32(2, 1)); + prod = svmul_f32_z(svptrue_b32(), prod, half_prod); + + // The reduction is done to the first element. 
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) +{ + return svminv_f32(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) +{ + return svmaxv_f32(svptrue_b32(), a); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) +{ + float buffer[packet_traits::size * N] = {0}; + int i = 0; + + PacketXi stride_index = svindex_s32(0, N); + + for (i = 0; i < N; i++) { + svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits::size); + } +} + +template<> +EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) +{ + return pldexp_generic(a, exponent); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_SVE_H diff --git a/externals/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h new file mode 100644 index 00000000..7ba5d9cd --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SVE/TypeCasting.h @@ -0,0 +1,49 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_TYPE_CASTING_SVE_H +#define EIGEN_TYPE_CASTING_SVE_H + +namespace Eigen { +namespace internal { + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXi& a) { + return svcvt_f32_s32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXf& a) { + return svcvt_s32_f32_z(svptrue_b32(), a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preinterpret(const PacketXi& a) { + return svreinterpret_f32_s32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preinterpret(const PacketXf& a) { + return svreinterpret_s32_f32(a); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SVE_H diff --git a/externals/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/externals/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h new file mode 100644 index 00000000..10856ff5 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -0,0 +1,232 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * InteropHeaders.h + * + * \brief: + * InteropHeaders + * + *****************************************************************/ + +#ifndef EIGEN_INTEROP_HEADERS_SYCL_H +#define EIGEN_INTEROP_HEADERS_SYCL_H + +namespace Eigen { + +#if !defined(EIGEN_DONT_VECTORIZE_SYCL) + +namespace internal { + +template +struct sycl_packet_traits : default_packet_traits { + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = lengths, + HasHalfPacket = 0, + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasSin = 1, + HasCos = 1, + HasTan = 1, + HasASin = 1, + HasACos = 1, + HasATan = 1, + HasSinh = 1, + HasCosh = 1, + HasTanh = 1, + HasLGamma = 0, + HasDiGamma = 0, + HasZeta = 0, + HasPolygamma = 0, + HasErf = 0, + HasErfc = 0, + HasNdtri = 0, + HasIGamma = 0, + HasIGammac = 0, + HasBetaInc = 0, + HasBlend = has_blend, + // This flag is used to indicate whether packet comparison is supported. + // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true. + HasCmp = 1, + HasMax = 1, + HasMin = 1, + HasMul = 1, + HasAdd = 1, + HasFloor = 1, + HasRound = 1, + HasRint = 1, + HasLog1p = 1, + HasExpm1 = 1, + HasCeil = 1, + }; +}; + +#ifdef SYCL_DEVICE_ONLY +#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \ + template <> \ + struct packet_traits \ + : sycl_packet_traits { \ + typedef packet_type type; \ + typedef packet_type half; \ + }; + +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2) +#undef SYCL_PACKET_TRAITS + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#define SYCL_ARITHMETIC(packet_type) \ + template <> \ + struct is_arithmetic { \ + enum { value = true }; \ + }; +SYCL_ARITHMETIC(cl::sycl::cl_float4) +SYCL_ARITHMETIC(cl::sycl::cl_double2) +#undef SYCL_ARITHMETIC + +#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths) \ + template <> \ + struct unpacket_traits { \ + typedef unpacket_type type; \ + enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \ + typedef packet_type half; \ + }; +SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) +SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2) + +#undef SYCL_UNPACKET_TRAITS +#endif + +} // end namespace internal + +#endif + +namespace TensorSycl { +namespace internal { + +template +struct PacketWrapper; +// This function should never get called on the device +#ifndef SYCL_DEVICE_ONLY +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) { + eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE"); + abort(); + } + EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, + Scalar) { + return ::Eigen::internal::template plset(in); + } + EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) { + eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE"); + abort(); + } +}; + +#elif defined(SYCL_DEVICE_ONLY) +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) { + switch (index) { + case 0: + return in.x(); + case 1: + return in.y(); + case 2: + return in.z(); + case 3: + return in.w(); + default: + //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. 
+ // The code will never reach here + __builtin_unreachable(); + } + __builtin_unreachable(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type( + Scalar in, Scalar other) { + return PacketReturnType(in, other, other, other); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { + lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]); + } +}; + +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) { + return in; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, + Scalar) { + return PacketReturnType(in); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { + lhs = rhs[0]; + } +}; + +template +struct PacketWrapper { + typedef typename ::Eigen::internal::unpacket_traits::type + Scalar; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) { + switch (index) { + case 0: + return in.x(); + case 1: + return in.y(); + default: + //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. 
+ // The code will never reach here + __builtin_unreachable(); + } + __builtin_unreachable(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type( + Scalar in, Scalar other) { + return PacketReturnType(in, other); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { + lhs = PacketReturnType(rhs[0], rhs[1]); + } +}; + +#endif + +} // end namespace internal +} // end namespace TensorSycl +} // end namespace Eigen + +#endif // EIGEN_INTEROP_HEADERS_SYCL_H diff --git a/externals/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h new file mode 100644 index 00000000..2ab0f2a7 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h @@ -0,0 +1,301 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * MathFunctions.h + * + * \brief: + * MathFunctions + * + *****************************************************************/ + +#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H +#define EIGEN_MATH_FUNCTIONS_SYCL_H +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(SYCL_DEVICE_ONLY) +#define SYCL_PLOG(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog( \ + const packet_type& a) { \ + return cl::sycl::log(a); \ + } + +SYCL_PLOG(cl::sycl::cl_float4) +SYCL_PLOG(cl::sycl::cl_double2) +#undef SYCL_PLOG + +#define SYCL_PLOG1P(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p( \ + const packet_type& a) { \ + return cl::sycl::log1p(a); \ + } + +SYCL_PLOG1P(cl::sycl::cl_float4) +SYCL_PLOG1P(cl::sycl::cl_double2) +#undef SYCL_PLOG1P + +#define SYCL_PLOG10(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10( \ + const packet_type& a) { \ + return cl::sycl::log10(a); \ + } + +SYCL_PLOG10(cl::sycl::cl_float4) +SYCL_PLOG10(cl::sycl::cl_double2) +#undef SYCL_PLOG10 + +#define SYCL_PEXP(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp( \ + const packet_type& a) { \ + return cl::sycl::exp(a); \ + } + +SYCL_PEXP(cl::sycl::cl_float4) +SYCL_PEXP(cl::sycl::cl_float) +SYCL_PEXP(cl::sycl::cl_double2) +#undef SYCL_PEXP + +#define SYCL_PEXPM1(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1( \ + const packet_type& a) { \ + return cl::sycl::expm1(a); \ + } + +SYCL_PEXPM1(cl::sycl::cl_float4) +SYCL_PEXPM1(cl::sycl::cl_double2) +#undef SYCL_PEXPM1 + +#define SYCL_PSQRT(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt( \ + const packet_type& a) { \ + return cl::sycl::sqrt(a); \ + } + +SYCL_PSQRT(cl::sycl::cl_float4) +SYCL_PSQRT(cl::sycl::cl_double2) +#undef SYCL_PSQRT + +#define SYCL_PRSQRT(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt( \ + const packet_type& a) { \ + return cl::sycl::rsqrt(a); \ + } + +SYCL_PRSQRT(cl::sycl::cl_float4) +SYCL_PRSQRT(cl::sycl::cl_double2) +#undef SYCL_PRSQRT + +/** \internal \returns the sine of \a a (coeff-wise) */ +#define
SYCL_PSIN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin( \ + const packet_type& a) { \ + return cl::sycl::sin(a); \ + } + +SYCL_PSIN(cl::sycl::cl_float4) +SYCL_PSIN(cl::sycl::cl_double2) +#undef SYCL_PSIN + +/** \internal \returns the cosine of \a a (coeff-wise) */ +#define SYCL_PCOS(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos( \ + const packet_type& a) { \ + return cl::sycl::cos(a); \ + } + +SYCL_PCOS(cl::sycl::cl_float4) +SYCL_PCOS(cl::sycl::cl_double2) +#undef SYCL_PCOS + +/** \internal \returns the tangent of \a a (coeff-wise) */ +#define SYCL_PTAN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan( \ + const packet_type& a) { \ + return cl::sycl::tan(a); \ + } + +SYCL_PTAN(cl::sycl::cl_float4) +SYCL_PTAN(cl::sycl::cl_double2) +#undef SYCL_PTAN + +/** \internal \returns the arc sine of \a a (coeff-wise) */ +#define SYCL_PASIN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin( \ + const packet_type& a) { \ + return cl::sycl::asin(a); \ + } + +SYCL_PASIN(cl::sycl::cl_float4) +SYCL_PASIN(cl::sycl::cl_double2) +#undef SYCL_PASIN + +/** \internal \returns the arc cosine of \a a (coeff-wise) */ +#define SYCL_PACOS(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos( \ + const packet_type& a) { \ + return cl::sycl::acos(a); \ + } + +SYCL_PACOS(cl::sycl::cl_float4) +SYCL_PACOS(cl::sycl::cl_double2) +#undef SYCL_PACOS + +/** \internal \returns the arc tangent of \a a (coeff-wise) */ +#define SYCL_PATAN(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan( \ + const packet_type& a) { \ + return cl::sycl::atan(a); \ + } + +SYCL_PATAN(cl::sycl::cl_float4) +SYCL_PATAN(cl::sycl::cl_double2) +#undef SYCL_PATAN + +/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ +#define SYCL_PSINH(packet_type)
\ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh( \ + const packet_type& a) { \ + return cl::sycl::sinh(a); \ + } + +SYCL_PSINH(cl::sycl::cl_float4) +SYCL_PSINH(cl::sycl::cl_double2) +#undef SYCL_PSINH + +/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ +#define SYCL_PCOSH(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh( \ + const packet_type& a) { \ + return cl::sycl::cosh(a); \ + } + +SYCL_PCOSH(cl::sycl::cl_float4) +SYCL_PCOSH(cl::sycl::cl_double2) +#undef SYCL_PCOSH + +/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ +#define SYCL_PTANH(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh( \ + const packet_type& a) { \ + return cl::sycl::tanh(a); \ + } + +SYCL_PTANH(cl::sycl::cl_float4) +SYCL_PTANH(cl::sycl::cl_double2) +#undef SYCL_PTANH + +#define SYCL_PCEIL(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil( \ + const packet_type& a) { \ + return cl::sycl::ceil(a); \ + } + +SYCL_PCEIL(cl::sycl::cl_float4) +SYCL_PCEIL(cl::sycl::cl_double2) +#undef SYCL_PCEIL + +#define SYCL_PROUND(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround( \ + const packet_type& a) { \ + return cl::sycl::round(a); \ + } + +SYCL_PROUND(cl::sycl::cl_float4) +SYCL_PROUND(cl::sycl::cl_double2) +#undef SYCL_PROUND + +#define SYCL_PRINT(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print( \ + const packet_type& a) { \ + return cl::sycl::rint(a); \ + } + +SYCL_PRINT(cl::sycl::cl_float4) +SYCL_PRINT(cl::sycl::cl_double2) +#undef SYCL_PRINT + +#define SYCL_FLOOR(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor( \ + const packet_type& a) { \ + return cl::sycl::floor(a); \ + } + +SYCL_FLOOR(cl::sycl::cl_float4) +SYCL_FLOOR(cl::sycl::cl_double2) +#undef SYCL_FLOOR + +#define SYCL_PMIN(packet_type, expr) \ 
+ template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin( \ + const packet_type& a, const packet_type& b) { \ + return expr; \ + } + +SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b)) +SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b)) +#undef SYCL_PMIN + +#define SYCL_PMAX(packet_type, expr) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax( \ + const packet_type& a, const packet_type& b) { \ + return expr; \ + } + +SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b)) +SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b)) +#undef SYCL_PMAX + +#define SYCL_PLDEXP(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp( \ + const packet_type& a, const packet_type& exponent) { \ + return cl::sycl::ldexp( \ + a, exponent.template convert()); \ + } + +SYCL_PLDEXP(cl::sycl::cl_float4) +SYCL_PLDEXP(cl::sycl::cl_double2) +#undef SYCL_PLDEXP + +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_SYCL_H diff --git a/externals/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h new file mode 100644 index 00000000..87badc07 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -0,0 +1,670 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * PacketMath.h + * + * \brief: + * PacketMath + * + *****************************************************************/ + +#ifndef EIGEN_PACKET_MATH_SYCL_H +#define EIGEN_PACKET_MATH_SYCL_H +#include +namespace Eigen { + +namespace internal { +#ifdef SYCL_DEVICE_ONLY + +#define SYCL_PLOADT_RO(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + typedef typename unpacket_traits::type scalar; \ + typedef cl::sycl::multi_ptr< \ + scalar, cl::sycl::access::address_space::address_space_target> \ + multi_ptr; \ + auto res = packet_type( \ + static_cast::type>(0)); \ + res.load(0, multi_ptr(const_cast(from))); \ + return res; \ + } + +SYCL_PLOADT_RO(global_space) +SYCL_PLOADT_RO(local_space) +#undef SYCL_PLOADT_RO +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, T>& from) { + return ploadt_ro(from.get_pointer()); +} + +#ifdef SYCL_DEVICE_ONLY +#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + return ploadt_ro(from); \ + } + +// global space +SYCL_PLOAD(global_space, Unaligned, u) +SYCL_PLOAD(global_space, Aligned, ) +// local space +SYCL_PLOAD(local_space, Unaligned, u) +SYCL_PLOAD(local_space, Aligned, ) + +#undef SYCL_PLOAD +#endif + +#define SYCL_PLOAD(Alignment, AlignedType) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ + const Eigen::TensorSycl::internal::RangeAccess< \ + 
cl::sycl::access::mode::read_write, \ + typename unpacket_traits::type> \ + from) { \ + return ploadt_ro(from); \ + } +SYCL_PLOAD(Unaligned, u) +SYCL_PLOAD(Aligned, ) +#undef SYCL_PLOAD + +#ifdef SYCL_DEVICE_ONLY +/** \internal \returns a packet version of \a *from. + * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ +#define SYCL_PLOADT(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + if (Alignment >= unpacket_traits::alignment) \ + return pload(from); \ + else \ + return ploadu(from); \ + } + +// global space +SYCL_PLOADT(global_space) +// local space +SYCL_PLOADT(local_space) +#undef SYCL_PLOADT +#endif + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, + typename unpacket_traits::type>& from) { + return ploadt(from.get_pointer()); +} +#ifdef SYCL_DEVICE_ONLY + +// private_space +#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ + ploadt_ro( \ + const typename unpacket_traits::type* from) { \ + typedef typename unpacket_traits::type scalar; \ + auto res = packet_type(static_cast(0)); \ + res.template load( \ + 0, const_cast(from)); \ + return res; \ + } + +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) + +#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \ + const typename unpacket_traits::type* from) { \ + typedef typename unpacket_traits::type scalar; \ + auto res = 
packet_type(static_cast(0)); \ + res.template load( \ + 0, const_cast(from)); \ + return res; \ + } +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, ) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, ) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) + +#undef SYCL_PLOAD_SPECIAL + +#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + typename cl::sycl::multi_ptr< \ + scalar, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + to, \ + const packet_type& from) { \ + typedef cl::sycl::multi_ptr< \ + scalar, cl::sycl::access::address_space::address_space_target> \ + multi_ptr; \ + from.store(0, multi_ptr(to)); \ + } + +// global space +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) + +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) +#undef SYCL_PSTORE + +#define SYCL_PSTORE_T(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \ + typename cl::sycl::multi_ptr< \ + scalar, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + to, \ + const packet_type& from) { \ + if (Alignment) \ + pstore(to, from); \ + else \ + pstoreu(to, from); \ + } + +SYCL_PSTORE_T(global_space) + +SYCL_PSTORE_T(local_space) + +#undef SYCL_PSTORE_T + +#define SYCL_PSET1(packet_type) \ + template 
<> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1( \ + const typename unpacket_traits::type& from) { \ + return packet_type(from); \ + } + +// global space +SYCL_PSET1(cl::sycl::cl_float4) +SYCL_PSET1(cl::sycl::cl_double2) + +#undef SYCL_PSET1 + +template +struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type + get_ploaddup(sycl_multi_pointer) {} + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type + get_pgather(sycl_multi_pointer, Index) {} +}; + +template <> +struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup( + sycl_multi_pointer from) { + return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather( + sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_float4(from[0 * stride], from[1 * stride], + from[2 * stride], from[3 * stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter( + sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) { + auto tmp = stride; + to[0] = from.x(); + to[tmp] = from.y(); + to[tmp += stride] = from.z(); + to[tmp += stride] = from.w(); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset( + const float& a) { + return cl::sycl::cl_float4(static_cast(a), static_cast(a + 1), + static_cast(a + 2), + static_cast(a + 3)); + } +}; + +template <> +struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 + get_ploaddup(const sycl_multi_pointer from) { + return cl::sycl::cl_double2(from[0], from[0]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather( + const sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
set_pscatter( + sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) { + to[0] = from.x(); + to[stride] = from.y(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset( + const double& a) { + return cl::sycl::cl_double2(static_cast(a), + static_cast(a + 1)); + } +}; + +#define SYCL_PLOAD_DUP(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from) { \ + return get_base_packet::get_ploaddup(from); \ + } + +// global space +SYCL_PLOAD_DUP(global_space) +// local_space +SYCL_PLOAD_DUP(local_space) +#undef SYCL_PLOAD_DUP + +#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ + const typename unpacket_traits::type* from) { \ + return get_base_packet::get_ploaddup(from); \ + } + +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4) +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) + +#undef SYCL_PLOAD_DUP_SPECILIZE + +#define SYCL_PLSET(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset( \ + const typename unpacket_traits::type& a) { \ + return get_base_packet::set_plset(a); \ + } + +SYCL_PLSET(cl::sycl::cl_float4) +SYCL_PLSET(cl::sycl::cl_double2) + +#undef SYCL_PLSET + +#define SYCL_PGATHER(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC inline packet_type pgather( \ + typename cl::sycl::multi_ptr< \ + const typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + from, \ + Index stride) { \ + return get_base_packet::get_pgather(from, stride); \ + } + +// global space +SYCL_PGATHER(global_space) +// local space +SYCL_PGATHER(local_space) + +#undef SYCL_PGATHER + +#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE packet_type \ + pgather( \ + const typename unpacket_traits::type* from, Index stride) { \ + return get_base_packet::get_pgather(from, stride); \ + } + +SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PGATHER_SPECILIZE + +#define SYCL_PSCATTER(address_space_target) \ + template \ + EIGEN_DEVICE_FUNC inline void pscatter( \ + typename cl::sycl::multi_ptr< \ + typename unpacket_traits::type, \ + cl::sycl::access::address_space::address_space_target>::pointer_t \ + to, \ + const packet_type& from, Index stride) { \ + get_base_packet::set_pscatter(to, from, stride); \ + } + +// global space +SYCL_PSCATTER(global_space) +// local space +SYCL_PSCATTER(local_space) + +#undef SYCL_PSCATTER + +#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( \ + typename unpacket_traits::type * to, \ + const packet_type& from, Index stride) { \ + get_base_packet::set_pscatter(to, from, stride); \ + } + +SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PSCATTER_SPECILIZE + +#define SYCL_PMAD(packet_type) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( \ + const packet_type& a, const packet_type& b, const packet_type& c) { \ + return cl::sycl::mad(a, b, c); \ + } + +SYCL_PMAD(cl::sycl::cl_float4) +SYCL_PMAD(cl::sycl::cl_double2) +#undef SYCL_PMAD + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst( + const cl::sycl::cl_float4& a) { + return a.x(); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst( + const cl::sycl::cl_double2& a) { + return a.x(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux( + const cl::sycl::cl_float4& a) { + return a.x() + a.y() + a.z() + a.w(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux( + const cl::sycl::cl_double2& a) 
{ + return a.x() + a.y(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max( + const cl::sycl::cl_float4& a) { + return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), + cl::sycl::fmax(a.z(), a.w())); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max( + const cl::sycl::cl_double2& a) { + return cl::sycl::fmax(a.x(), a.y()); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min( + const cl::sycl::cl_float4& a) { + return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), + cl::sycl::fmin(a.z(), a.w())); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min( + const cl::sycl::cl_double2& a) { + return cl::sycl::fmin(a.x(), a.y()); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul( + const cl::sycl::cl_float4& a) { + return a.x() * a.y() * a.z() * a.w(); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul( + const cl::sycl::cl_double2& a) { + return a.x() * a.y(); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pabs(const cl::sycl::cl_float4& a) { + return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), + cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w())); +} +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 +pabs(const cl::sycl::cl_double2& a) { + return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y())); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a, + const Packet &b) { + return ((a <= b) + .template convert::type, + cl::sycl::rounding_mode::automatic>()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a, + const Packet &b) { + return ((a < b) + .template convert::type, + cl::sycl::rounding_mode::automatic>()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a, + const Packet &b) { + return ((a == b) + .template convert::type, + 
cl::sycl::rounding_mode::automatic>()); +} + +#define SYCL_PCMP(OP, TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP(const TYPE &a, \ + const TYPE &b) { \ + return sycl_pcmp_##OP(a, b); \ + } + +SYCL_PCMP(le, cl::sycl::cl_float4) +SYCL_PCMP(lt, cl::sycl::cl_float4) +SYCL_PCMP(eq, cl::sycl::cl_float4) +SYCL_PCMP(le, cl::sycl::cl_double2) +SYCL_PCMP(lt, cl::sycl::cl_double2) +SYCL_PCMP(eq, cl::sycl::cl_double2) +#undef SYCL_PCMP + +template struct convert_to_integer; + +template <> struct convert_to_integer { + using type = std::int32_t; + using packet_type = cl::sycl::cl_int4; +}; +template <> struct convert_to_integer { + using type = std::int64_t; + using packet_type = cl::sycl::cl_long2; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer< + typename unpacket_traits::type>::packet_type +vector_as_int(const PacketIn &p) { + return ( + p.template convert::type>::type, + cl::sycl::rounding_mode::automatic>()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut +convert_vector(const PacketIn &p) { + return (p.template convert::type, + cl::sycl::rounding_mode::automatic>()); +} + +#define SYCL_PAND(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) & vector_as_int(b)); \ + } +SYCL_PAND(cl::sycl::cl_float4) +SYCL_PAND(cl::sycl::cl_double2) +#undef SYCL_PAND + +#define SYCL_POR(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) | vector_as_int(b)); \ + } + +SYCL_POR(cl::sycl::cl_float4) +SYCL_POR(cl::sycl::cl_double2) +#undef SYCL_POR + +#define SYCL_PXOR(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) ^ vector_as_int(b)); \ + } + +SYCL_PXOR(cl::sycl::cl_float4) +SYCL_PXOR(cl::sycl::cl_double2) 
+#undef SYCL_PXOR + +#define SYCL_PANDNOT(TYPE) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot(const TYPE &a, \ + const TYPE &b) { \ + return convert_vector(vector_as_int(a) & (~vector_as_int(b))); \ + } +SYCL_PANDNOT(cl::sycl::cl_float4) +SYCL_PANDNOT(cl::sycl::cl_double2) +#undef SYCL_PANDNOT + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( + PacketBlock& kernel) { + float tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; + + tmp = kernel.packet[0].z(); + kernel.packet[0].z() = kernel.packet[2].x(); + kernel.packet[2].x() = tmp; + + tmp = kernel.packet[0].w(); + kernel.packet[0].w() = kernel.packet[3].x(); + kernel.packet[3].x() = tmp; + + tmp = kernel.packet[1].z(); + kernel.packet[1].z() = kernel.packet[2].y(); + kernel.packet[2].y() = tmp; + + tmp = kernel.packet[1].w(); + kernel.packet[1].w() = kernel.packet[3].y(); + kernel.packet[3].y() = tmp; + + tmp = kernel.packet[2].w(); + kernel.packet[2].w() = kernel.packet[3].z(); + kernel.packet[3].z() = tmp; +} + +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( + PacketBlock& kernel) { + double tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend( + const Selector::size>& ifPacket, + const cl::sycl::cl_float4& thenPacket, + const cl::sycl::cl_float4& elsePacket) { + cl::sycl::cl_int4 condition( + ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, + ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} + +template <> +inline cl::sycl::cl_double2 pblend( + const Selector::size>& ifPacket, + const cl::sycl::cl_double2& thenPacket, + const cl::sycl::cl_double2& elsePacket) { + cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, + ifPacket.select[1] ? 
0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} +#endif // SYCL_DEVICE_ONLY + +#define SYCL_PSTORE(alignment) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + const Eigen::TensorSycl::internal::RangeAccess< \ + cl::sycl::access::mode::read_write, \ + typename unpacket_traits::type>& to, \ + const packet_type& from) { \ + pstore##alignment(to.get_pointer(), from); \ + } + +// global space +SYCL_PSTORE() +SYCL_PSTORE(u) + +#undef SYCL_PSTORE + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( + Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, + typename unpacket_traits::type> + to, + const packet_type& from) { + pstoret(to.get_pointer(), from); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_SYCL_H diff --git a/externals/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/externals/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h new file mode 100644 index 00000000..f81e59db --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h @@ -0,0 +1,694 @@ +/*************************************************************************** + * Copyright (C) 2017 Codeplay Software Limited + * This Source Code Form is subject to the terms of the Mozilla + * Public License v. 2.0. If a copy of the MPL was not distributed + * with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * + * SyclMemoryModel.h + * + * Description: + * Interface for SYCL buffers to behave as a non-dereferenceable pointer + * Interface for Placeholder accessor to behave as a pointer on both host + * and device + * + * Authors: + * + * Ruyman Reyes Codeplay Software Ltd. + * Mehdi Goli Codeplay Software Ltd. + * Vanya Yaneva Codeplay Software Ltd. 
+ * + **************************************************************************/ + +#if defined(EIGEN_USE_SYCL) && \ + !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H) +#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H + +#include +#ifdef EIGEN_EXCEPTIONS +#include +#endif +#include +#include +#include +#include + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +using sycl_acc_target = cl::sycl::access::target; +using sycl_acc_mode = cl::sycl::access::mode; + +/** + * Default values for template arguments + */ +using buffer_data_type_t = uint8_t; +const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer; +const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write; + +/** + * PointerMapper + * Associates fake pointers with buffers. + * + */ +class PointerMapper { + public: + using base_ptr_t = std::intptr_t; + + /* Structure of a virtual pointer + * + * |================================================| + * | POINTER ADDRESS | + * |================================================| + */ + struct virtual_pointer_t { + /* Type for the pointers + */ + base_ptr_t m_contents; + + /** Conversions from virtual_pointer_t to + * void * should just reinterpret_cast the integer number + */ + operator void *() const { return reinterpret_cast(m_contents); } + + /** + * Convert back to the integer number. + */ + operator base_ptr_t() const { return m_contents; } + + /** + * Add a certain value to the pointer to create a + * new pointer to that offset + */ + virtual_pointer_t operator+(size_t off) { return m_contents + off; } + + /* Numerical order for sorting pointers in containers. 
*/ + bool operator<(virtual_pointer_t rhs) const { + return (static_cast(m_contents) < + static_cast(rhs.m_contents)); + } + + bool operator>(virtual_pointer_t rhs) const { + return (static_cast(m_contents) > + static_cast(rhs.m_contents)); + } + + /** + * Numerical order for sorting pointers in containers + */ + bool operator==(virtual_pointer_t rhs) const { + return (static_cast(m_contents) == + static_cast(rhs.m_contents)); + } + + /** + * Simple forward to the equality overload. + */ + bool operator!=(virtual_pointer_t rhs) const { + return !(this->operator==(rhs)); + } + + /** + * Converts a void * into a virtual pointer structure. + * Note that this will only work if the void * was + * already a virtual_pointer_t, but we have no way of + * checking + */ + virtual_pointer_t(const void *ptr) + : m_contents(reinterpret_cast(ptr)){}; + + /** + * Creates a virtual_pointer_t from the given integer + * number + */ + virtual_pointer_t(base_ptr_t u) : m_contents(u){}; + }; + + /* Definition of a null pointer + */ + const virtual_pointer_t null_virtual_ptr = nullptr; + + /** + * Whether if a pointer is null or not. + * A pointer is nullptr if the value is of null_virtual_ptr + */ + static inline bool is_nullptr(virtual_pointer_t ptr) { + return (static_cast(ptr) == nullptr); + } + + /* basic type for all buffers + */ + using buffer_t = cl::sycl::buffer_mem; + + /** + * Node that stores information about a device allocation. + * Nodes are sorted by size to organise a free list of nodes + * that can be recovered. 
+ */ + struct pMapNode_t { + buffer_t m_buffer; + size_t m_size; + bool m_free; + + pMapNode_t(buffer_t b, size_t size, bool f) + : m_buffer{b}, m_size{size}, m_free{f} { + m_buffer.set_final_data(nullptr); + } + + bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); } + }; + + /** Storage of the pointer / buffer tree + */ + using pointerMap_t = std::map; + + /** + * Obtain the insertion point in the pointer map for + * a pointer of the given size. + * \param requiredSize Size attemted to reclaim + */ + typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) { + typename pointerMap_t::iterator retVal; + bool reuse = false; + if (!m_freeList.empty()) { + // try to re-use an existing block + for (auto freeElem : m_freeList) { + if (freeElem->second.m_size >= requiredSize) { + retVal = freeElem; + reuse = true; + // Element is not going to be free anymore + m_freeList.erase(freeElem); + break; + } + } + } + if (!reuse) { + retVal = std::prev(m_pointerMap.end()); + } + return retVal; + } + + /** + * Returns an iterator to the node that stores the information + * of the given virtual pointer from the given pointer map structure. + * If pointer is not found, throws std::out_of_range. 
+ * If the pointer map structure is empty, throws std::out_of_range + * + * \param pMap the pointerMap_t structure storing all the pointers + * \param virtual_pointer_ptr The virtual pointer to obtain the node of + * \throws std::out:of_range if the pointer is not found or pMap is empty + */ + typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) { + if (this->count() == 0) { + m_pointerMap.clear(); + EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n")); + + } + if (is_nullptr(ptr)) { + m_pointerMap.clear(); + EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n")); + } + // The previous element to the lower bound is the node that + // holds this memory address + auto node = m_pointerMap.lower_bound(ptr); + // If the value of the pointer is not the one of the node + // then we return the previous one + if (node == std::end(m_pointerMap)) { + --node; + } else if (node->first != ptr) { + if (node == std::begin(m_pointerMap)) { + m_pointerMap.clear(); + EIGEN_THROW_X( + std::out_of_range("The pointer is not registered in the map\n")); + + } + --node; + } + + return node; + } + + /* get_buffer. + * Returns a buffer from the map using the pointer address + */ + template + cl::sycl::buffer get_buffer( + const virtual_pointer_t ptr) { + using sycl_buffer_t = cl::sycl::buffer; + + // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`. + // We can do this without the `buffer_mem` being a pointer, as we + // only declare member variables in the base class (`buffer_mem`) and not in + // the child class (`buffer<>). 
+ auto node = get_node(ptr); + eigen_assert(node->first == ptr || node->first < ptr); + eigen_assert(ptr < static_cast(node->second.m_size + + node->first)); + return *(static_cast(&node->second.m_buffer)); + } + + /** + * @brief Returns an accessor to the buffer of the given virtual pointer + * @param accessMode + * @param accessTarget + * @param ptr The virtual pointer + */ + template + cl::sycl::accessor + get_access(const virtual_pointer_t ptr) { + auto buf = get_buffer(ptr); + return buf.template get_access(); + } + + /** + * @brief Returns an accessor to the buffer of the given virtual pointer + * in the given command group scope + * @param accessMode + * @param accessTarget + * @param ptr The virtual pointer + * @param cgh Reference to the command group scope + */ + template + cl::sycl::accessor + get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) { + auto buf = get_buffer(ptr); + return buf.template get_access(cgh); + } + + /* + * Returns the offset from the base address of this pointer. + */ + inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) { + // The previous element to the lower bound is the node that + // holds this memory address + auto node = get_node(ptr); + auto start = node->first; + eigen_assert(start == ptr || start < ptr); + eigen_assert(ptr < start + node->second.m_size); + return (ptr - start); + } + + /* + * Returns the number of elements by which the given pointer is offset from + * the base address. + */ + template + inline size_t get_element_offset(const virtual_pointer_t ptr) { + return get_offset(ptr) / sizeof(buffer_data_type); + } + + /** + * Constructs the PointerMapper structure. 
+ */ + PointerMapper(base_ptr_t baseAddress = 4096) + : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} { + if (m_baseAddress == 0) { + EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n")); + } + }; + + /** + * PointerMapper cannot be copied or moved + */ + PointerMapper(const PointerMapper &) = delete; + + /** + * Empties both the free list and the pointer map + */ + inline void clear() { + m_freeList.clear(); + m_pointerMap.clear(); + } + + /* add_pointer. + * Adds an existing pointer to the map and returns the virtual pointer id. + */ + inline virtual_pointer_t add_pointer(const buffer_t &b) { + return add_pointer_impl(b); + } + + /* add_pointer. + * Adds a pointer to the map and returns the virtual pointer id. + */ + inline virtual_pointer_t add_pointer(buffer_t &&b) { + return add_pointer_impl(b); + } + + /** + * @brief Fuses the given node with the following nodes in the + * pointer map if they are free + * + * @param node A reference to the free node to be fused + */ + void fuse_forward(typename pointerMap_t::iterator &node) { + while (node != std::prev(m_pointerMap.end())) { + // if following node is free + // remove it and extend the current node with its size + auto fwd_node = std::next(node); + if (!fwd_node->second.m_free) { + break; + } + auto fwd_size = fwd_node->second.m_size; + m_freeList.erase(fwd_node); + m_pointerMap.erase(fwd_node); + + node->second.m_size += fwd_size; + } + } + + /** + * @brief Fuses the given node with the previous nodes in the + * pointer map if they are free + * + * @param node A reference to the free node to be fused; updated to the merged node + */ + void fuse_backward(typename pointerMap_t::iterator &node) { + while (node != m_pointerMap.begin()) { + // if previous node is free, extend it + // with the size of the current one + auto prev_node = std::prev(node); + if (!prev_node->second.m_free) { + break; + } + prev_node->second.m_size += node->second.m_size; + + // remove the current node + m_freeList.erase(node); + m_pointerMap.erase(node); + +
// point to the previous node + node = prev_node; + } + } + + /* remove_pointer. + * Removes the given pointer from the map. + * The pointer is allowed to be reused only if ReUse is true. + */ + template + void remove_pointer(const virtual_pointer_t ptr) { + if (is_nullptr(ptr)) { + return; + } + auto node = this->get_node(ptr); + + node->second.m_free = true; + m_freeList.emplace(node); + + // Fuse the node + // with free nodes before and after it + fuse_forward(node); + fuse_backward(node); + + // If after fusing the node is the last one + // simply remove it (since it is free) + if (node == std::prev(m_pointerMap.end())) { + m_freeList.erase(node); + m_pointerMap.erase(node); + } + } + + /* count. + * Return the number of active pointers (i.e., map entries that + * are not on the free list). + */ + size_t count() const { return (m_pointerMap.size() - m_freeList.size()); } + + private: + /* add_pointer_impl. + * Adds a pointer to the map and returns the virtual pointer id. + * BufferT is either a const buffer_t& or a buffer_t&&.
+ */ + template + virtual_pointer_t add_pointer_impl(BufferT b) { + virtual_pointer_t retVal = nullptr; + size_t bufSize = b.get_count(); + pMapNode_t p{b, bufSize, false}; + // If this is the first pointer: + if (m_pointerMap.empty()) { + virtual_pointer_t initialVal{m_baseAddress}; + m_pointerMap.emplace(initialVal, p); + return initialVal; + } + + auto lastElemIter = get_insertion_point(bufSize); + // We are recovering an existing free node + if (lastElemIter->second.m_free) { + lastElemIter->second.m_buffer = b; + lastElemIter->second.m_free = false; + + // If the recovered node is bigger than the inserted one + // add a new free node with the remaining space + if (lastElemIter->second.m_size > bufSize) { + // create a new node with the remaining space + auto remainingSize = lastElemIter->second.m_size - bufSize; + pMapNode_t p2{b, remainingSize, true}; + + // update size of the current node + lastElemIter->second.m_size = bufSize; + + // add the new free node + auto newFreePtr = lastElemIter->first + bufSize; + auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first; + m_freeList.emplace(freeNode); + } + + retVal = lastElemIter->first; + } else { // returned node is in use: place the new entry right after it + size_t lastSize = lastElemIter->second.m_size; + retVal = lastElemIter->first + lastSize; + m_pointerMap.emplace(retVal, p); + } + return retVal; + } + + /** + * Orders two iterators to pointer map entries. NOTE(review): both clauses + * require a->first < b->first, so size alone never decides -- confirm intent. + */ + struct SortBySize { + bool operator()(typename pointerMap_t::iterator a, + typename pointerMap_t::iterator b) const { + return ((a->first < b->first) && (a->second <= b->second)) || + ((a->first < b->first) && (b->second <= a->second)); + } + }; + + /* Maps the pointer addresses to buffer and size pairs. + */ + pointerMap_t m_pointerMap; + + /* List of free nodes available for re-using + */ + std::set m_freeList; + + /* Base address used when issuing the first virtual pointer, allows users + * to specify alignment. Cannot be zero.
*/ + std::intptr_t m_baseAddress; +}; + +/* remove_pointer. + * Removes the given pointer from the map. + * This specialization erases the node outright; it is never put on the free list. + */ +template <> +inline void PointerMapper::remove_pointer(const virtual_pointer_t ptr) { + if (is_nullptr(ptr)) { + return; + } + m_pointerMap.erase(this->get_node(ptr)); +} + +/** + * Malloc-like interface to the pointer-mapper. + * Given a size, creates a byte-typed buffer and returns a + * fake pointer to keep track of it. + * \param size Size in bytes of the desired allocation (0 returns nullptr) + * \throw cl::sycl::exception if error while creating the buffer + */ +inline void *SYCLmalloc(size_t size, PointerMapper &pMap) { + if (size == 0) { + return nullptr; + } + // Create a generic buffer of the given size and register it in pMap + using buffer_t = cl::sycl::buffer; + auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size})); + // Hand back the virtual pointer issued by pMap as an opaque pointer + return static_cast(thePointer); +} + +/** + * Free-like interface to the pointer mapper. + * Given a fake-pointer created with the virtual-pointer malloc, + * destroys the buffer and removes it from the list. + * If ReUse is false, the pointer is not added to the freeList, + * it should be false only for sub-buffers. + */ +template +inline void SYCLfree(void *ptr, PointerMapper &pMap) { + pMap.template remove_pointer(ptr); +} + +/** + * Clear all the memory allocated by SYCL.
+ */ +template +inline void SYCLfreeAll(PointerMapper &pMap) { + pMap.clear(); +} + +template +struct RangeAccess { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef T scalar_t; + typedef scalar_t &ref_t; + typedef typename cl::sycl::global_ptr::pointer_t ptr_t; + + // the accessor type is not necessarily the same as T + typedef cl::sycl::accessor + accessor; + + typedef RangeAccess self_t; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access, + size_t offset, + std::intptr_t virtual_ptr) + : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {} + + RangeAccess(cl::sycl::buffer buff = + cl::sycl::buffer(cl::sycl::range<1>(1))) + : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {} + + // This should only be used for null construction on the host side + RangeAccess(std::nullptr_t) : RangeAccess() {} + // This template parameter must be removed and scalar_t should be replaced + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const { + return (access_.get_pointer().get() + offset_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) { + offset_ += (offset); + return *this; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const { + return self_t(access_, offset_ + offset, virtual_ptr_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const { + return self_t(access_, offset_ - offset, virtual_ptr_); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) { + offset_ -= offset; + return *this; + } + + // THIS IS FOR NULL COMPARISON ONLY (virtual_ptr_ == -1 marks a null RangeAccess) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( + const RangeAccess &lhs, std::nullptr_t) { + return ((lhs.virtual_ptr_ == -1)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( + const RangeAccess &lhs,
std::nullptr_t i) { + return !(lhs == i); + } + + // THIS IS FOR NULL COMPARISON ONLY + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( + std::nullptr_t, const RangeAccess &rhs) { + return ((rhs.virtual_ptr_ == -1)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( + std::nullptr_t i, const RangeAccess &rhs) { + return !(i == rhs); + } + // Prefix operator (Increment and return value) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() { + offset_++; + return (*this); + } + + // Postfix operator (Return value and increment) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) { + EIGEN_UNUSED_VARIABLE(i); + self_t temp_iterator(*this); + offset_++; + return temp_iterator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const { + return (access_.get_count() - offset_); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const { + return offset_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) { + offset_ = offset; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const { + return *get_pointer(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() { + return *get_pointer(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) { + return *(get_pointer() + x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const { + return *(get_pointer() + x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const { + return reinterpret_cast(virtual_ptr_ + + (offset_ * sizeof(scalar_t))); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const { + return (virtual_ptr_ != -1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess() { + return RangeAccess(access_, offset_, virtual_ptr_); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + operator 
RangeAccess() const { + return RangeAccess(access_, offset_, virtual_ptr_); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( + cl::sycl::handler &cgh) const { + cgh.require(access_); + } + + private: + accessor access_; + size_t offset_; + std::intptr_t virtual_ptr_; // the location of the buffer in the map +}; + +template +struct RangeAccess : RangeAccess { + typedef RangeAccess Base; + using Base::Base; +}; + +} // namespace internal +} // namespace TensorSycl +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H diff --git a/externals/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h b/externals/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h new file mode 100644 index 00000000..9208ab21 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -0,0 +1,85 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +/***************************************************************** + * TypeCasting.h + * + * \brief: + * TypeCasting + * + *****************************************************************/ + +#ifndef EIGEN_TYPE_CASTING_SYCL_H +#define EIGEN_TYPE_CASTING_SYCL_H + +namespace Eigen { + +namespace internal { +#ifdef SYCL_DEVICE_ONLY +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 +pcast(const cl::sycl::cl_float4& a) { + return a + .template convert(); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pcast(const cl::sycl::cl_int4& a) { + return a.template convert(); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pcast( + const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) { + auto a1 = a.template convert(); + auto b1 = b.template convert(); + return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); +} + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; +}; + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 +pcast(const cl::sycl::cl_float4& a) { + // Simply discard the second half of the input + return cl::sycl::cl_double2(a.x(), a.y()); +} + +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SYCL_H diff --git a/externals/eigen/Eigen/src/Core/arch/ZVector/Complex.h b/externals/eigen/Eigen/src/Core/arch/ZVector/Complex.h index d39d2d10..0b9b33d9 100644 --- a/externals/eigen/Eigen/src/Core/arch/ZVector/Complex.h +++ b/externals/eigen/Eigen/src/Core/arch/ZVector/Complex.h @@ -15,6 +15,10 @@ namespace 
Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); +#endif + static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -29,10 +33,14 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) union { Packet4f v; Packet1cd cd[2]; }; +#else + Packet4f v; +#endif }; template<> struct packet_traits > : default_packet_traits @@ -83,69 +91,33 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); -template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +/* complex first */ template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return 
Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); - res.cd[1] = res.cd[0]; - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, 
Index stride EIGEN_UNUSED) { pstore >(to, from); } - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet2cf res; - res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; - res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d a_re, a_im, v1, v2; @@ -163,27 +135,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet2cf res; - res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; - res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } 
-template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { + Packet2d eq = vec_cmpeq (a.v, b.v); + Packet2d tmp = { eq[1], eq[0] }; + return (Packet1cd)pand(eq, tmp); +} -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) 
@@ -193,157 +155,157 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac return res; } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); - - return res[0]; -} template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { - Packet2cf res; - res.cd[0] = a.cd[1]; - res.cd[1] = a.cd[0]; - return res; + return pfirst(a); } - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - std::complex res; - Packet1cd b = padd(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; + // TODO optimize it for AltiVec + Packet1cd res = pmul(a,pconj(b)); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } -template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { - return vecs[0]; + return Packet1cd(preverse(Packet2d(x.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - ptranspose(transpose); + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} - return padd(transpose.packet[0], transpose.packet[1]); -} +/* 
complex follows */ +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - return pfirst(a); + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - std::complex res; - Packet1cd b = pmul(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; return res; } - -template -struct palign_impl +#else +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) - { - // FIXME is it sure we never have to align a Packet1cd? - // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
- } -}; + Packet2cf res; + if((std::ptrdiff_t(&from) % 16) == 0) + res.v = pload((const float *)&from); + else + res.v = ploadu((const float *)&from); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); + return res; +} +#endif -template -struct palign_impl +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) { - first.cd[0] = first.cd[1]; - first.cd[1] = second.cd[0]; - } - } -}; - -template<> struct conj_helper + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> 
EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = pcmp_eq (a.v, b.v); + Packet2cf res; + Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] }; + Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] }; + res.v.v4f[0] = pand(eq.v4f[0], tmp1); + res.v.v4f[1] = pand(eq.v4f[1], tmp2); + return res; +} -template<> struct conj_helper +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return 
res; +} - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; +} -template<> struct conj_helper +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { - // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; } +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec @@ -353,11 +315,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return res; } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - EIGEN_STRONG_INLINE 
Packet2cf pcplxflip/**/(const Packet2cf& x) { Packet2cf res; @@ -366,13 +323,6 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) return res; } -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet1cd tmp = kernel.packet[0].cd[1]; @@ -386,6 +336,88 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); return result; } +#else +template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { + Packet4f eq = vec_cmpeq (a.v, b.v); + Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] }; + return (Packet2cf)pand(eq, tmp); +} +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f a_re, a_im, prod, prod_im; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + + // multiply a_im * b and get the conjugate result + prod_im = a_im * b.v; + prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); + // permute back to a proper order + prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + + // multiply a_re * b, add prod_im + prod = pmadd(a_re, b.v, prod_im); + + return Packet2cf(prod); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet4f rev_a; + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + return Packet2cf(rev_a); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + Packet4f b; + b = 
vec_sld(a.v, a.v, 8); + b = padd(a.v, b); + return pfirst(Packet2cf(b)); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + Packet4f b; + Packet2cf prod; + b = vec_sld(a.v, a.v, 8); + prod = pmul(a, Packet2cf(b)); + + return pfirst(prod); +} + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res = pmul(a, pconj(b)); + Packet4f s = pmul(b.v, b.v); + return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); + return result; +} +#endif } // end namespace internal diff --git a/externals/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/externals/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h index 5c7aa725..1635e128 100644 --- a/externals/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/externals/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -20,6 +20,50 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static 
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static 
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +#endif + static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); @@ -93,43 +137,95 @@ Packet2d pexp(const Packet2d& _x) } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) +Packet4f pexp(const Packet4f& _x) { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + Packet4f x = _x; + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] }; + emm0 = emm0 + p4i_0x7f; + emm0 = emm0 << reinterpret_cast(p4i_23); + + return pmax(pmul(y, reinterpret_cast(emm0)), _x); +#else Packet4f res; - res.v4f[0] = pexp(x.v4f[0]); - res.v4f[1] = pexp(x.v4f[1]); + res.v4f[0] = pexp(_x.v4f[0]); + res.v4f[1] = pexp(_x.v4f[1]); return res; +#endif } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { - return __builtin_s390_vfsqdb(x); + return vec_sqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = vec_sqrt(x); +#else res.v4f[0] = psqrt(x.v4f[0]); 
res.v4f[1] = psqrt(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pset1(1.0) / psqrt(x); +#else res.v4f[0] = prsqrt(x.v4f[0]); res.v4f[1] = prsqrt(x.v4f[1]); +#endif return res; } +// Hyperbolic Tangent function. +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& x) { + return internal::generic_fast_tanh_float(x); +} + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/externals/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h index 57b01fc6..1f55a90a 100644 --- a/externals/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/externals/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -10,26 +10,20 @@ #ifndef EIGEN_PACKET_MATH_ZVECTOR_H #define EIGEN_PACKET_MATH_ZVECTOR_H -#include - namespace Eigen { namespace internal { #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector int Packet4i; @@ -41,21 +35,30 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +// Z14 has builtin support for 
float vectors +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +typedef __vector float Packet4f; +#else typedef struct { Packet2d v4f[2]; } Packet4f; +#endif typedef union { - int32_t i[4]; - uint32_t ui[4]; - int64_t l[2]; - uint64_t ul[2]; + numext::int32_t i[4]; + numext::uint32_t ui[4]; + numext::int64_t l[2]; + numext::uint64_t ul[2]; double d[2]; + float f[4]; Packet4i v4i; Packet4ui v4ui; Packet2l v2l; Packet2ul v2ul; Packet2d v2d; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + Packet4f v4f; +#endif } Packet; // We don't want to write the same code all the time, but we need to reuse the constants @@ -80,15 +83,31 @@ typedef union { Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { numext::bit_cast0x8000000000000000ull), + numext::bit_cast0x8000000000000000ull) }; + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) + +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; +#endif static Packet4i 
p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; @@ -120,9 +139,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -149,29 +168,31 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct packet_traits : default_packet_traits -{ +template <> +struct packet_traits : default_packet_traits { typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4, + size = 4, HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, - HasExp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, HasSqrt = 1, HasRsqrt = 1, + HasTanh = 1, + HasErf = 1, HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -211,9 +232,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, 
alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); @@ -258,82 +279,15 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } -/* Helper function to simulate a vec_splat_packet4f - */ -template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { - Packet4f splat; - switch (element) { - case 0: - splat.v4f[0] = vec_splat(from.v4f[0], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 1: - splat.v4f[0] = vec_splat(from.v4f[0], 1); - splat.v4f[1] = splat.v4f[0]; - break; - case 2: - splat.v4f[0] = vec_splat(from.v4f[1], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 3: - splat.v4f[0] = vec_splat(from.v4f[1], 1); - splat.v4f[1] = splat.v4f[0]; - break; - } - return splat; + Packet vt; + vt.v4f = v; + s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3]; + return s; } - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) - { - switch (Offset % 4) { - case 1: - first = vec_sld(first, second, 4); break; - case 2: - first = 
vec_sld(first, second, 8); break; - case 3: - first = vec_sld(first, second, 12); break; - } - } -}; - -/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double - */ -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); - first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); - break; - case 2: - first.v4f[0] = first.v4f[1]; - first.v4f[1] = second.v4f[0]; - break; - case 3: - first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); - first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); - break; - } - } -}; - - -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) - { - if (Offset == 1) - first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); - } -}; +#endif template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { @@ -344,16 +298,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet4f vfrom; - vfrom.v4f[0] = vec_ld2f(&from[0]); - vfrom.v4f[1] = vec_ld2f(&from[2]); - return vfrom; -} - template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -372,15 +316,6 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - vec_st2f(from.v4f[0], &to[0]); - vec_st2f(from.v4f[1], &to[2]); -} - - template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -397,13 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) template<> 
EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - Packet4f to; - to.v4f[0] = pset1(static_cast(from)); - to.v4f[1] = to.v4f[0]; - return to; -} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -416,17 +344,6 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat_packet4f<0>(a3); - a1 = vec_splat_packet4f<1>(a3); - a2 = vec_splat_packet4f<2>(a3); - a3 = vec_splat_packet4f<3>(a3); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -449,16 +366,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} - template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -477,16 +384,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - pstore((float *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -496,160 +393,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet4f 
padd(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] + b.v4f[0]; - c.v4f[1] = a.v4f[1] + b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] - b.v4f[0]; - c.v4f[1] = a.v4f[1] - b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] * b.v4f[0]; - c.v4f[1] = a.v4f[1] * b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] / b.v4f[0]; - c.v4f[1] = a.v4f[1] / b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) -{ - Packet4f c; - c.v4f[0] = -a.v4f[0]; - c.v4f[1] = -a.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> 
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ - Packet4f res; - res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); - res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = 
pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); - res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); - return res; -} -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_round(a.v4f[0]); - res.v4f[1] = vec_round(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_ceil(a.v4f[0]); - res.v4f[1] = vec_ceil(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - Packet4f res; - 
res.v4f[0] = vec_floor(a.v4f[0]); - res.v4f[1] = vec_floor(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } @@ -659,14 +448,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p = pload(from); - p.v4f[1] = vec_splat(p.v4f[0], 1); - p.v4f[0] = vec_splat(p.v4f[0], 0); - return p; -} - template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -674,15 +455,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ 
-695,23 +473,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - Packet4f rev; - rev.v4f[0] = preverse(a.v4f[1]); - rev.v4f[1] = preverse(a.v4f[0]); - return rev; -} - template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = pabs(a.v4f[0]); - res.v4f[1] = pabs(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -730,71 +493,10 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet2d sum; - sum = padd(a.v4f[0], a.v4f[1]); - double first = predux(sum); - return static_cast(first); -} - -template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -{ - Packet4i v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = padd(sum[0], sum[1]); - // Lines 2+3 - sum[1] = padd(sum[2], sum[3]); - // Add the results - sum[0] = padd(sum[0], sum[1]); - - return sum[0]; -} - -template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) -{ - Packet2d v[2], sum; - v[0] = 
padd(vecs[0], reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8))); - v[1] = padd(vecs[1], reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8))); - - sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); - - return sum; -} - -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - transpose.packet[2] = vecs[2]; - transpose.packet[3] = vecs[3]; - ptranspose(transpose); - - Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); - sum = padd(sum, transpose.packet[2]); - sum = padd(sum, transpose.packet[3]); - return sum; -} - -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) { EIGEN_ALIGN16 int aux[4]; pstore(aux, a); @@ -806,12 +508,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - // Return predux_mul of the subvectors product - return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); -} - // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -826,14 +522,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet2d b, res; - b = pmin(a.v4f[0], a.v4f[1]); - res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -849,14 +537,6 @@ template<> 
EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet2d b, res; - b = pmax(a.v4f[0], a.v4f[1]); - res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -877,6 +557,282 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +/* z13 has no vector float support so we emulate that with double + z14 has proper vector float support. 
+*/ +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = 
ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + Packet4f res; + res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); + res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = por(a.v4f[0], b.v4f[0]); + res.v4f[1] = por(a.v4f[1], b.v4f[1]); 
+ return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pxor(a.v4f[0], b.v4f[0]); + res.v4f[1] = pxor(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); + res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_round(a.v4f[0]); + res.v4f[1] = vec_round(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_ceil(a.v4f[0]); + res.v4f[1] = vec_ceil(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_floor(a.v4f[0]); + res.v4f[1] = vec_floor(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + p.v4f[1] = vec_splat(p.v4f[0], 1); + p.v4f[0] = vec_splat(p.v4f[0], 0); + return p; +} + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + Packet4f rev; + rev.v4f[0] = preverse(a.v4f[1]); + rev.v4f[1] = preverse(a.v4f[0]); + return rev; +} + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = pabs(a.v4f[0]); + res.v4f[1] = pabs(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet2d sum; + sum = padd(a.v4f[0], a.v4f[1]); + double first = predux(sum); + return static_cast(first); +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + // Return predux_mul of the subvectors product + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], 
a.v4f[1])))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */ EIGEN_DEVICE_FUNC inline void @@ -915,12 +871,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].v4f[1] = t3.packet[1]; } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; @@ -932,12 +882,177 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons return result; } -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); +template<> Packet4f EIGEN_STRONG_INLINE pcmp_le(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]); + res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]); + return res; +} + +template<> Packet4f 
EIGEN_STRONG_INLINE pcmp_lt(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]); + res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]); + return res; +} + +template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]); + res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]); + return res; +} + +#else +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + pstore((float*)af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return (a * b); } +template<> 
EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet4f b, sum; + b = vec_sld(a, a, 8); + 
sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); +} + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +#endif + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } + } // end namespace internal } // end 
namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/functors/AssignmentFunctors.h b/externals/eigen/Eigen/src/Core/functors/AssignmentFunctors.h index 4153b877..bf64ef4e 100644 --- a/externals/eigen/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/externals/eigen/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef __CUDACC__ +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else @@ -157,7 +157,16 @@ template struct functor_traits > { enum { Cost = 3 * NumTraits::ReadCost, - PacketAccess = packet_traits::Vectorizable + PacketAccess = + #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__)) + // This is a partial workaround for a bug in clang generating bad code + // when mixing 256/512 bits loads and 128 bits moves. 
+ // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684 + // https://bugs.llvm.org/show_bug.cgi?id=40815 + 0 + #else + packet_traits::Vectorizable + #endif }; }; diff --git a/externals/eigen/Eigen/src/Core/functors/BinaryFunctors.h b/externals/eigen/Eigen/src/Core/functors/BinaryFunctors.h index 96747bac..63f09ab9 100644 --- a/externals/eigen/Eigen/src/Core/functors/BinaryFunctors.h +++ b/externals/eigen/Eigen/src/Core/functors/BinaryFunctors.h @@ -39,32 +39,26 @@ struct scalar_sum_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::padd(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux(a); } }; template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough estimate! + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd // TODO vectorize mixed sum }; }; -/** \internal - * \brief Template specialization to deprecate the summation of boolean expressions. - * This is required to solve Bug 426. 
- * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast() - */ -template<> struct scalar_sum_op : scalar_sum_op { - EIGEN_DEPRECATED - scalar_sum_op() {} -}; + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op::operator() (const bool& a, const bool& b) const { return a || b; } /** \internal @@ -83,23 +77,27 @@ struct scalar_product_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const { return internal::predux_mul(a); } }; template struct functor_traits > { enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! + Cost = (int(NumTraits::MulCost) + int(NumTraits::MulCost))/2, // rough estimate! 
PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul // TODO vectorize mixed product }; }; +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op::operator() (const bool& a, const bool& b) const { return a && b; } + + /** \internal * \brief Template functor to compute the conjugate product of two scalars * @@ -116,11 +114,11 @@ struct scalar_conj_product_op : binary_op_base typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper().pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } }; template @@ -136,21 +134,28 @@ struct functor_traits > { * * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff() */ -template +template struct scalar_min_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { + return internal::pmin(a, b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmin(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { + return internal::pmin(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type 
predux(const Packet& a) const - { return internal::predux_min(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + { + return internal::predux_min(a); + } }; -template -struct functor_traits > { + +template +struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, PacketAccess = internal::is_same::value && packet_traits::HasMin @@ -162,21 +167,28 @@ struct functor_traits > { * * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff() */ -template -struct scalar_max_op : binary_op_base +template +struct scalar_max_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { + return internal::pmax(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmax(a,b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + { + return internal::pmax(a,b); + } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_max(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + { + return internal::predux_max(a); + } }; -template -struct functor_traits > { + +template +struct functor_traits > { enum { Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, PacketAccess = internal::is_same::value && packet_traits::HasMax @@ -253,9 +265,8 @@ struct scalar_cmp_op : binary_op_base struct scalar_hypot_op : binary_op_base { EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) -// typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const { - EIGEN_USING_STD_MATH(sqrt) - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - return p * sqrt(Scalar(1) + qp*qp); + // This functor is used by hypotNorm only for which it is faster to first apply abs + // on all coefficients prior to reduction through hypot. + // This way we avoid calling abs on positive and real entries, and this also permits + // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes + // through the same functor... + return internal::positive_real_hypot(x,y); } }; template @@ -294,6 +298,7 @@ struct functor_traits > { /** \internal * \brief Template functor to compute the pow of two scalars + * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow */ template struct scalar_pow_op : binary_op_base @@ -308,16 +313,31 @@ struct scalar_pow_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif + EIGEN_DEVICE_FUNC inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { + return generic_pow(a,b); + } }; + template struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = (!NumTraits::IsComplex && !NumTraits::IsInteger && + packet_traits::HasExp && packet_traits::HasLog && + packet_traits::HasRound && packet_traits::HasCmp && + // Temporarly disable packet access for half/bfloat16 until + // accuracy is improved. 
+ !is_same::value && !is_same::value + ) + }; }; - - //---------- non associative binary functors ---------- /** \internal @@ -344,7 +364,7 @@ struct scalar_difference_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub }; }; @@ -389,11 +409,14 @@ struct functor_traits > { struct scalar_boolean_and_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pand(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true }; }; @@ -405,11 +428,14 @@ template<> struct functor_traits { struct scalar_boolean_or_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::por(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true }; }; @@ -421,11 +447,44 @@ template<> struct functor_traits { struct scalar_boolean_xor_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pxor(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = false + PacketAccess = true + }; +}; + +/** \internal + * \brief 
Template functor to compute the absolute difference of two scalars + * + * \sa class CwiseBinaryOp, MatrixBase::absolute_difference + */ +template +struct scalar_absolute_difference_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op) +#else + scalar_absolute_difference_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + { return numext::absdiff(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pabsdiff(a,b); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = is_same::value && packet_traits::HasAbsDiff }; }; @@ -443,7 +502,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -462,7 +521,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/externals/eigen/Eigen/src/Core/functors/NullaryFunctors.h 
b/externals/eigen/Eigen/src/Core/functors/NullaryFunctors.h index 6a30466f..192f225d 100644 --- a/externals/eigen/Eigen/src/Core/functors/NullaryFunctors.h +++ b/externals/eigen/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,45 +37,46 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), - m_interPacket(plset(0)), + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { if(m_flip) - return (i==0)? m_low : (m_high - (m_size1-i)*m_step); + return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step); else - return (i==m_size1)? m_high : (m_low + i*m_step); + return (i==m_size1)? 
m_high : Scalar(m_low + RealScalar(i)*m_step); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) if(m_flip) { - Packet pi = padd(pset1(Scalar(i-m_size1)),m_interPacket); + Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); - if(i==0) - res = pinsertfirst(res, m_low); - return res; + if (EIGEN_PREDICT_TRUE(i != 0)) return res; + Packet mask = pcmp_lt(pset1(0), plset(0)); + return pselect(mask, res, pset1(m_low)); } else { - Packet pi = padd(pset1(Scalar(i)),m_interPacket); + Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); - if(i==m_size1-unpacket_traits::size+1) - res = pinsertlast(res, m_high); - return res; + if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits::size+1)) return res; + Packet mask = pcmp_lt(plset(0), pset1(unpacket_traits::size-1)); + return pselect(mask, res, pset1(m_high)); } } @@ -83,14 +84,13 @@ struct linspaced_op_impl const Scalar m_high; const Index m_size1; const Scalar m_step; - const Packet m_interPacket; const bool m_flip; }; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_multiplier((high-low)/convert_index(num_steps<=1 ? 1 : num_steps-1)), m_divisor(convert_index((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))), @@ -116,8 +116,8 @@ struct linspaced_op_impl // Forward declaration (we default to random access which does not really give // us a speed gain when using packet access but it allows to use the functor in // nested expressions). 
-template struct linspaced_op; -template struct functor_traits< linspaced_op > +template struct linspaced_op; +template struct functor_traits< linspaced_op > { enum { @@ -127,9 +127,9 @@ template struct functor_traits< linspaced IsRepeatable = true }; }; -template struct linspaced_op +template struct linspaced_op { - linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low),high,num_steps) {} @@ -137,11 +137,11 @@ template struct linspaced_op EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp(i); } // This proxy object handles the actual required temporaries and the different // implementations (integer vs. floating point). - const linspaced_op_impl::IsInteger> impl; + const linspaced_op_impl::IsInteger> impl; }; // Linear access is automatically determined from the operator() prototypes available for the given functor. 
@@ -167,12 +167,12 @@ struct has_unary_operator,IndexType> { enum { value = template struct has_binary_operator,IndexType> { enum { value = 1}; }; -template -struct has_nullary_operator,IndexType> { enum { value = 0}; }; -template -struct has_unary_operator,IndexType> { enum { value = 1}; }; -template -struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 1}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; template struct has_nullary_operator,IndexType> { enum { value = 1}; }; diff --git a/externals/eigen/Eigen/src/Core/functors/StlFunctors.h b/externals/eigen/Eigen/src/Core/functors/StlFunctors.h index 6df3fa50..4570c9b6 100644 --- a/externals/eigen/Eigen/src/Core/functors/StlFunctors.h +++ b/externals/eigen/Eigen/src/Core/functors/StlFunctors.h @@ -12,6 +12,28 @@ namespace Eigen { +// Portable replacements for certain functors. 
+namespace numext { + +template +struct equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +template +struct not_equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +} + + namespace internal { // default functor traits for STL functors: @@ -68,11 +90,19 @@ template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; -#if (__cplusplus < 201103L) && (EIGEN_COMP_MSVC <= 1900) +template +struct functor_traits > + : functor_traits > {}; + +#if (EIGEN_COMP_CXXVER < 11) // std::binder* are deprecated since c++11 and will be removed in c++17 template struct functor_traits > @@ -83,13 +113,17 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif +#if (EIGEN_COMP_CXXVER < 17) +// std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +// std::binary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +#endif #ifdef EIGEN_STDEXT_SUPPORT diff --git a/externals/eigen/Eigen/src/Core/functors/UnaryFunctors.h b/externals/eigen/Eigen/src/Core/functors/UnaryFunctors.h index 2e6a00ff..16136d18 100644 --- a/externals/eigen/Eigen/src/Core/functors/UnaryFunctors.h +++ b/externals/eigen/Eigen/src/Core/functors/UnaryFunctors.h @@ -109,7 +109,7 @@ struct functor_traits > template struct scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + 
EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; @@ -117,7 +117,15 @@ template struct functor_traits > { enum { - Cost = NumTraits::IsComplex ? NumTraits::AddCost : 0, + Cost = 0, + // Yes the cost is zero even for complexes because in most cases for which + // the cost is used, conjugation turns to be a no-op. Some examples: + // cost(a*conj(b)) == cost(a*b) + // cost(a+conj(b)) == cost(a+b) + // ::HasConj }; }; @@ -130,7 +138,7 @@ struct functor_traits > template struct scalar_arg_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } @@ -158,6 +166,44 @@ template struct functor_traits > { enum { Cost = is_same::value ? 
0 : NumTraits::AddCost, PacketAccess = false }; }; +/** \internal + * \brief Template functor to arithmetically shift a scalar right by a number of bits + * + * \sa class CwiseUnaryOp, MatrixBase::shift_right() + */ +template +struct scalar_shift_right_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const + { return a >> N; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::parithmetic_shift_right(a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; + +/** \internal + * \brief Template functor to logically shift a scalar left by a number of bits + * + * \sa class CwiseUnaryOp, MatrixBase::shift_left() + */ +template +struct scalar_shift_left_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op) + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const + { return a << N; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::plogical_shift_left(a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; + /** \internal * \brief Template functor to extract the real part of a complex * @@ -262,6 +308,26 @@ struct functor_traits > { }; }; +/** \internal + * + * \brief Template functor to compute the exponential of a scalar - 1. 
+ * + * \sa class CwiseUnaryOp, ArrayBase::expm1() + */ +template struct scalar_expm1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasExpm1, + Cost = functor_traits >::Cost // TODO measure cost of expm1 + }; +}; + /** \internal * * \brief Template functor to compute the logarithm of a scalar @@ -321,7 +387,7 @@ struct functor_traits > { */ template struct scalar_log10_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } }; @@ -329,6 +395,22 @@ template struct functor_traits > { enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog10 }; }; +/** \internal + * + * \brief Template functor to compute the base-2 logarithm of a scalar + * + * \sa class CwiseUnaryOp, Cwise::log2() + */ +template struct scalar_log2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_log2_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(EIGEN_LOG2E) * numext::log(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog2(a); } +}; +template +struct functor_traits > +{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog }; }; + /** \internal * \brief Template functor to compute the square root of a scalar * \sa class CwiseUnaryOp, Cwise::sqrt() @@ -356,13 +438,25 @@ struct functor_traits > { }; }; +// Boolean specialization to eliminate 
-Wimplicit-conversion-floating-point-to-bool warnings. +template<> struct scalar_sqrt_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return a; } +}; +template <> +struct functor_traits > { + enum { Cost = 1, PacketAccess = packet_traits::Vectorizable }; +}; + /** \internal * \brief Template functor to compute the reciprocal square root of a scalar * \sa class CwiseUnaryOp, Cwise::rsqrt() */ template struct scalar_rsqrt_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); } + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::rsqrt(a); } template EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } }; @@ -528,6 +622,23 @@ struct functor_traits > { }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the atanh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::atanh() + */ +template +struct scalar_atanh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() @@ -547,6 +658,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the asinh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::asinh() + */ +template +struct scalar_asinh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } 
+}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() @@ -566,6 +694,23 @@ struct functor_traits > }; }; +#if EIGEN_HAS_CXX11_MATH +/** \internal + * \brief Template functor to compute the acosh of a scalar + * \sa class CwiseUnaryOp, ArrayBase::acosh() + */ +template +struct scalar_acosh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } +}; + +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +#endif + /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() @@ -578,9 +723,13 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(Scalar(1)),a); } }; -template -struct functor_traits > -{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasDiv, + Cost = scalar_div_cost::value + }; +}; /** \internal * \brief Template functor to compute the square of a scalar @@ -598,6 +747,19 @@ template struct functor_traits > { enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. 
+template<> +struct scalar_square_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return a; } +}; +template<> +struct functor_traits > +{ enum { Cost = 0, PacketAccess = packet_traits::Vectorizable }; }; + /** \internal * \brief Template functor to compute the cube of a scalar * \sa class CwiseUnaryOp, Cwise::cube() @@ -614,6 +776,19 @@ template struct functor_traits > { enum { Cost = 2*NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. +template<> +struct scalar_cube_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator() (const bool& a) const { return a; } + template + EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return a; } +}; +template<> +struct functor_traits > +{ enum { Cost = 0, PacketAccess = packet_traits::Vectorizable }; }; + /** \internal * \brief Template functor to compute the rounded value of a scalar * \sa class CwiseUnaryOp, ArrayBase::round() @@ -652,6 +827,25 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the rounded (with current rounding mode) value of a scalar + * \sa class CwiseUnaryOp, ArrayBase::rint() + */ +template struct scalar_rint_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = packet_traits::HasRint + }; +}; + /** \internal * \brief Template functor to compute the ceil of a scalar * \sa class 
CwiseUnaryOp, ArrayBase::ceil() @@ -678,7 +872,13 @@ struct functor_traits > template struct scalar_isnan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isnan(a); +#else + return (numext::isnan)(a); +#endif + } }; template struct functor_traits > @@ -696,7 +896,13 @@ struct functor_traits > template struct scalar_isinf_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isinf(a); +#else + return (numext::isinf)(a); +#endif + } }; template struct functor_traits > @@ -714,7 +920,13 @@ struct functor_traits > template struct scalar_isfinite_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { +#if defined(SYCL_DEVICE_ONLY) + return numext::isfinite(a); +#else + return (numext::isfinite)(a); +#endif + } }; template struct functor_traits > @@ -746,9 +958,9 @@ struct functor_traits > { * \brief Template functor to compute the signum of a scalar * \sa class CwiseUnaryOp, Cwise::sign() */ -template::IsComplex!=0) > struct scalar_sign_op; +template::IsComplex!=0), bool is_integer=(NumTraits::IsInteger!=0) > struct scalar_sign_op; template -struct scalar_sign_op { +struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { 
@@ -758,8 +970,21 @@ struct scalar_sign_op { //template //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } }; + template -struct scalar_sign_op { +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + { + return (numext::isnan)(a) ? a : Scalar( (a>Scalar(0)) - (a + //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); } +}; + +template +struct scalar_sign_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { @@ -768,7 +993,7 @@ struct scalar_sign_op { if (aa==real_type(0)) return Scalar(0); aa = real_type(1)/aa; - return Scalar(real(a)*aa, imag(a)*aa ); + return Scalar(a.real()*aa, a.imag()*aa ); } //TODO //template @@ -777,7 +1002,7 @@ struct scalar_sign_op { template struct functor_traits > { enum { - Cost = + Cost = NumTraits::IsComplex ? ( 8*NumTraits::MulCost ) // roughly : ( 3*NumTraits::AddCost), @@ -785,6 +1010,120 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the logistic function of a scalar + * \sa class CwiseUnaryOp, ArrayBase::logistic() + */ +template +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + return packetOp(x); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +#ifndef EIGEN_GPU_COMPILE_PHASE +/** \internal + * \brief Template specialization of the logistic function for float. + * + * Uses just a 9/10-degree rational interpolant which + * interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range + * [-9, 18]. 
Below -9 we use the more accurate approximation + * 1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing + * one ulp. The shifted logistic is interpolated because it was easier to + * make the fit converge. + * + */ +template <> +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { + return packetOp(x); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& _x) const { + const Packet cutoff_lower = pset1(-9.f); + const Packet lt_mask = pcmp_lt(_x, cutoff_lower); + const bool any_small = predux_any(lt_mask); + + // The upper cut-off is the smallest x for which the rational approximation evaluates to 1. + // Choosing this value saves us a few instructions clamping the results at the end. +#ifdef EIGEN_VECTORIZE_FMA + const Packet cutoff_upper = pset1(15.7243833541870117f); +#else + const Packet cutoff_upper = pset1(15.6437711715698242f); +#endif + const Packet x = pmin(_x, cutoff_upper); + + // The monomial coefficients of the numerator polynomial (odd). + const Packet alpha_1 = pset1(2.48287947061529e-01f); + const Packet alpha_3 = pset1(8.51377133304701e-03f); + const Packet alpha_5 = pset1(6.08574864600143e-05f); + const Packet alpha_7 = pset1(1.15627324459942e-07f); + const Packet alpha_9 = pset1(4.37031012579801e-11f); + + // The monomial coefficients of the denominator polynomial (even). + const Packet beta_0 = pset1(9.93151921023180e-01f); + const Packet beta_2 = pset1(1.16817656904453e-01f); + const Packet beta_4 = pset1(1.70198817374094e-03f); + const Packet beta_6 = pset1(6.29106785017040e-06f); + const Packet beta_8 = pset1(5.76102136993427e-09f); + const Packet beta_10 = pset1(6.10247389755681e-13f); + + // Since the polynomials are odd/even, we need x^2. + const Packet x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. 
+ Packet p = pmadd(x2, alpha_9, alpha_7); + p = pmadd(x2, p, alpha_5); + p = pmadd(x2, p, alpha_3); + p = pmadd(x2, p, alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet q = pmadd(x2, beta_10, beta_8); + q = pmadd(x2, q, beta_6); + q = pmadd(x2, q, beta_4); + q = pmadd(x2, q, beta_2); + q = pmadd(x2, q, beta_0); + // Divide the numerator by the denominator and shift it up. + const Packet logistic = padd(pdiv(p, q), pset1(0.5f)); + if (EIGEN_PREDICT_FALSE(any_small)) { + const Packet exponential = pexp(_x); + return pselect(lt_mask, exponential, logistic); + } else { + return logistic; + } + } +}; +#endif // #ifndef EIGEN_GPU_COMPILE_PHASE + +template +struct functor_traits > { + enum { + // The cost estimate for float here here is for the common(?) case where + // all arguments are greater than -9. + Cost = scalar_div_cost::HasDiv>::value + + (internal::is_same::value + ? NumTraits::AddCost * 15 + NumTraits::MulCost * 11 + : NumTraits::AddCost * 2 + + functor_traits >::Cost), + PacketAccess = + packet_traits::HasAdd && packet_traits::HasDiv && + (internal::is_same::value + ? packet_traits::HasMul && packet_traits::HasMax && + packet_traits::HasMin + : packet_traits::HasNegate && packet_traits::HasExp) + }; +}; + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/externals/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce..f35b760c 100644 --- a/externals/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/externals/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template +enum GEBPPacketSizeType { + GEBPPacketFull = 0, + GEBPPacketHalf, + GEBPPacketQuarter +}; + +template class gebp_traits; @@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? 
b : a; } +#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) + #if EIGEN_ARCH_i386_OR_x86_64 -const std::ptrdiff_t defaultL1CacheSize = 32*1024; -const std::ptrdiff_t defaultL2CacheSize = 256*1024; -const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); +#elif EIGEN_ARCH_PPC +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); #else -const std::ptrdiff_t defaultL1CacheSize = 16*1024; -const std::ptrdiff_t defaultL2CacheSize = 512*1024; -const std::ptrdiff_t defaultL3CacheSize = 512*1024; +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024); #endif +#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE + /** \internal */ struct CacheSizes { 
CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { @@ -50,7 +82,6 @@ struct CacheSizes { std::ptrdiff_t m_l3; }; - /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { @@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. + // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -115,7 +156,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); + // To avoid that k vanishes, we make k_cache at least as big as kr + const Index k_cache = numext::maxi(kr, (numext::mini)((l1-ksub)/kdiv, 320)); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -307,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - - // FIXME (a bit overkill maybe ?) 
- - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; - - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; +template +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; +}; - template - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector::run(cj,a,b,c,t); - } +template +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif +template +struct packet_conditional { typedef T3 type; }; + +template +struct packet_conditional { typedef T1 type; }; + +template +struct packet_conditional { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + 
typedef typename packet_conditional::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + ScalarPacket /* Vectorization logic * real*real: unpack rhs to constant packets, ... @@ -347,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -355,13 +422,17 @@ class gebp_traits typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -370,10 +441,12 @@ class gebp_traits // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) - // we assume 16 registers +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, @@ -383,37 +456,41 @@ class gebp_traits RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) 
const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); @@ -431,8 +508,8 @@ class gebp_traits dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -447,6 +524,12 @@ class gebp_traits #endif } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -460,21 +543,25 @@ class gebp_traits }; -template -class gebp_traits, RealScalar, _ConjLhs, false> +template +class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? 
packet_traits::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -489,13 +576,12 @@ class gebp_traits, RealScalar, _ConjLhs, false> RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -504,42 +590,64 @@ class gebp_traits, RealScalar, _ConjLhs, false> p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = pset1(*b); + loadRhsQuad_impl(b,dest, typename conditional::type()); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { - dest = pload(a); + // FIXME we can do better! 
+ // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad(tmp); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { - dest = ploadu(a); + eigen_internal_assert(RhsPacketSize<=8); + dest = pset1(*b); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + dest = pload(a); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu(a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -554,13 +662,20 @@ class gebp_traits, RealScalar, _ConjLhs, false> c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + 
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper cj; }; template @@ -579,13 +694,57 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// it terms of real coefficients. + template -const DoublePacket& predux_downto4(const DoublePacket &a) +const DoublePacket& +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size<=8>::type* = 0) { return a; } -template struct unpacket_traits > { typedef DoublePacket half; }; +template +DoublePacket::half> +predux_half_dowto4(const DoublePacket &a, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket::half> res; + typedef std::complex::type> Cplx; + typedef typename packet_traits::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size<=8>::type* = 0) +{ + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); +} + +template +void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, + typename enable_if::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits::Real RealScalar; + RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])}; + RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])}; + dest.first = ploadquad(r); + dest.second = ploadquad(i); +} + + +template struct unpacket_traits > { + typedef DoublePacket::half> half; +}; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -595,8 +754,8 @@ 
template struct unpacket_traits > { typede // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -604,15 +763,21 @@ class gebp_traits, std::complex, _ConjLhs, typedef std::complex RhsScalar; typedef std::complex ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits::vectorizable + && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -622,14 +787,16 @@ class gebp_traits, std::complex, _ConjLhs, RhsProgress = 1 }; - typedef typename packet_traits::type RealPacket; - typedef typename packet_traits::type ScalarPacket; - typedef DoublePacket DoublePacketType; + typedef DoublePacket DoublePacketType; + typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; + + // this actualy holds 8 packets! 
+ typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -640,51 +807,49 @@ class gebp_traits, std::complex, _ConjLhs, } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(real(*b)); - dest.second = pset1(imag(*b)); + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { - // FIXME not sure that's the best way to implement it! 
- loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadRhs(b,dest); } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -693,47 +858,59 @@ class gebp_traits, std::complex, _ConjLhs, dest = pload((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE + typename enable_if::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template + 
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -743,8 +920,8 @@ class gebp_traits, std::complex, _ConjLhs, conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs > +template +class gebp_traits, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex Scalar; @@ -752,14 +929,25 @@ class gebp_traits, false, _ConjRhs > typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + 
PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits::Vectorizable - && packet_traits::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -770,14 +958,11 @@ class gebp_traits, false, _ConjRhs > RhsProgress = 1 }; - typedef typename packet_traits::type _LhsPacket; - typedef typename packet_traits::type _RhsPacket; - typedef typename packet_traits::type _ResPacket; - typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -785,22 +970,25 @@ class gebp_traits, false, _ConjRhs > p = pset1(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); + dest = pset1(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, 
dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! -// b0 = pload1(b+0); -// b1 = pload1(b+1); -// } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -809,21 +997,23 @@ class gebp_traits, false, _ConjRhs > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits::size<=4); - loadRhs(b,dest); + dest = ploadquad(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -839,16 +1029,24 @@ class gebp_traits, false, _ConjRhs > c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, 
b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper cj; + }; -/* optimized GEneral packed Block * packed Panel product kernel +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments @@ -858,26 +1056,47 @@ class gebp_traits, false, _ConjRhs > template struct gebp_kernel { - typedef gebp_traits Traits; + typedef gebp_traits Traits; + typedef gebp_traits HalfTraits; + typedef gebp_traits QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper::type RhsPanel15; + + typedef gebp_traits SwappedTraits; - typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter 
= QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -887,6 +1106,299 @@ struct gebp_kernel Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; +template::LhsProgress> +struct last_row_process_16_packets +{ + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + EIGEN_UNUSED_VARIABLE(res); + EIGEN_UNUSED_VARIABLE(straits); + EIGEN_UNUSED_VARIABLE(blA); + EIGEN_UNUSED_VARIABLE(blB); + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(endk); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j2); + EIGEN_UNUSED_VARIABLE(alpha); + EIGEN_UNUSED_VARIABLE(C0); + } +}; + + +template +struct last_row_process_16_packets { + typedef gebp_traits Traits; + typedef gebp_traits SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + typedef typename unpacket_traits::half>::half SResPacketQuarter; + typedef typename unpacket_traits::half>::half 
SLhsPacketQuarter; + typedef typename unpacket_traits::half>::half SRhsPacketQuarter; + typedef typename unpacket_traits::half>::half SAccPacketQuarter; + + SResPacketQuarter R = res.template gatherPacket(i, j2); + SResPacketQuarter alphav = pset1(alpha); + + if (depth - endk > 0) + { + // We have to handle the last row(s) of the rhs, which + // correspond to a half-packet + SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); + + for (Index kk = endk; kk < depth; kk++) + { + SLhsPacketQuarter a0; + SRhsPacketQuarter b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + straits.madd(a0,b0,c0,b0, fix<0>); + blB += SwappedTraits::LhsProgress/4; + blA += 1; + } + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R); + } + res.scatterPacket(i, j2, R); + } +}; + +template +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index 
offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i(alpha); + + R0 = r0.template loadPacket(0); + R1 = r1.template loadPacket(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket(0); + R1 = r3.template loadPacket(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k(alpha); + R0 = r0.template loadPacket(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template +struct lhs_process_fraction_of_packet : lhs_process_one_packet +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + 
traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template EIGEN_DONT_INLINE void gebp_kernel @@ -903,10 +1415,12 @@ void gebp_kernel=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -964,36 +1478,48 @@ void gebp_kernel); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, 
fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1013,7 +1539,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1035,9 +1562,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1045,9 +1572,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1055,9 +1582,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1093,20 +1620,20 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1116,8 +1643,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 
* Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1195,26 +1722,34 @@ void gebp_kernel=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) - #define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A0, B1, C1, T0); \ - traits.madd(A1, B1, C5, B1); \ - traits.madd(A0, B2, C2, T0); \ - traits.madd(A1, B2, C6, B2); \ - traits.madd(A0, B3, C3, T0); \ - traits.madd(A1, B3, C7, B3); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ - } while(false) - 
internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1234,7 +1769,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(0 * Traits::ResPacketSize); - R3 = r1.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1257,10 +1793,10 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -1305,8 +1841,8 @@ void gebp_kernel); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1319,8 +1855,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -1350,186 +1886,43 @@ void gebp_kernel=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i(alpha); - - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = 
r3.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1::half SResPacketHalf; + // If LhsProgress is 8 or 16, it assumes that there is a + // half or quarter packet, respectively, of the same size as + // nr (which is currently 4) for the return type. 
+ const int SResPacketHalfSize = unpacket_traits::half>::size; + const int SResPacketQuarterSize = unpacket_traits::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && - (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + (SwappedTraits::LhsProgress<=16) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); @@ -1552,15 +1945,15 @@ void gebp_kernel); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1573,7 +1966,7 @@ void gebp_kernel); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1583,7 +1976,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -1596,16 +1989,25 @@ void gebp_kernel); straits.acc(c0, alphav, R); } else { - straits.acc(predux_downto4(C0), alphav, R); + straits.acc(predux_half_dowto4(C0), alphav, R); } res.scatterPacket(i, j2, R); } + else if (SwappedTraits::LhsProgress==16) + { + // Special case where we have to first reduce the + // accumulation register C0. 
We specialize the block in + // template form, so that LhsProgress < 16 paths don't + // fail to compile + last_row_process_16_packets p; + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + } else { SResPacket R = res.template gatherPacket(i, j2); @@ -1628,14 +2030,14 @@ void gebp_kernel -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1709,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? 
(rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1725,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); + C = lhs.template loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1745,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1763,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack scalars + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) + { + for(; i(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
if(Pack21) { - for(; i -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1813,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? 
(rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
+ if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i kernel; @@ -1971,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1%PacketSize] = dm1.loadPacket(k); - kernel.packet[2%PacketSize] = dm2.loadPacket(k); - kernel.packet[3%PacketSize] = dm3.loadPacket(k); + kernel.packet[0 ] = dm0.template loadPacket(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2015,94 +2501,104 @@ template { typedef typename packet_traits::type Packet; + typedef typename unpacket_traits::half HalfPacket; + typedef typename unpacket_traits::half>::half QuarterPacket; typedef typename DataMapper::LinearMapper LinearMapper; - enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); -}; - -template -EIGEN_DONT_INLINE void gemm_pack_rhs - ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) -{ - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - EIGEN_UNUSED_VARIABLE(stride); - EIGEN_UNUSED_VARIABLE(offset); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if::IsComplex && Conjugate> cj; - Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; - Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; - Index count = 0; - -// if(nr>=8) -// { -// for(Index j2=0; j2(&rhs[k*rhsStride + j2]); -// pstoreu(blockB+count, cj.pconj(A)); -// } else if (PacketSize==4) { -// Packet A = ploadu(&rhs[k*rhsStride + j2]); -// Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); -// pstoreu(blockB+count, cj.pconj(A)); -// pstoreu(blockB+count+PacketSize, cj.pconj(B)); -// } else { -// const Scalar* b0 = &rhs[k*rhsStride + j2]; -// blockB[count+0] = cj(b0[0]); -// blockB[count+1] = cj(b0[1]); -// blockB[count+2] = cj(b0[2]); -// blockB[count+3] = cj(b0[3]); -// blockB[count+4] = cj(b0[4]); -// blockB[count+5] = cj(b0[5]); -// blockB[count+6] = cj(b0[6]); -// blockB[count+7] = cj(b0[7]); -// } -// count += 8; -// } -// // skip what we have after -// if(PanelMode) count += 8 * (stride-offset-depth); -// } -// } - if(nr>=4) + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size}; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) { - for(Index j2=packet_cols8; j2=depth && offset<=stride)); + const bool HasHalf = (int)HalfPacketSize < (int)PacketSize; + const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize; + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; + Index count = 0; + + // if(nr>=8) + // { + // for(Index j2=0; j2(&rhs[k*rhsStride + j2]); + // pstoreu(blockB+count, cj.pconj(A)); + // } else if (PacketSize==4) { + // Packet A = ploadu(&rhs[k*rhsStride + j2]); + // Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); + // pstoreu(blockB+count, cj.pconj(A)); + // pstoreu(blockB+count+PacketSize, cj.pconj(B)); + // } else { + // const Scalar* b0 = &rhs[k*rhsStride + j2]; + // blockB[count+0] = cj(b0[0]); + // blockB[count+1] = cj(b0[1]); + // blockB[count+2] = cj(b0[2]); + // blockB[count+3] = cj(b0[3]); + // blockB[count+4] = cj(b0[4]); + // blockB[count+5] = cj(b0[5]); + // blockB[count+6] = cj(b0[6]); + // blockB[count+7] = cj(b0[7]); + // } + // count += 8; + // } + // // skip what we have after + // if(PanelMode) count += 8 * (stride-offset-depth); + // } + // } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += 4 * offset; - for(Index k=0; k(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else if (HasHalf && HalfPacketSize==4) { + HalfPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += HalfPacketSize; + } else if (HasQuarter && QuarterPacketSize==4) { + QuarterPacket A = rhs.template loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += QuarterPacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + count += 4; + } } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += 4 * (stride-offset-depth); } - } - // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols4; j2 class level3_blocking; template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, - typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> -struct 
general_matrix_matrix_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product { typedef gebp_traits Traits; @@ -30,7 +31,7 @@ struct general_matrix_matrix_product& blocking, GemmParallelInfo* info = 0) @@ -39,8 +40,8 @@ struct general_matrix_matrix_product - ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + ColMajor,ResInnerStride> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info); } }; @@ -49,8 +50,9 @@ struct general_matrix_matrix_product -struct general_matrix_matrix_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product { typedef gebp_traits Traits; @@ -59,23 +61,23 @@ typedef typename ScalarBinaryOpTraits::ReturnType ResScala static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resStride, + ResScalar* _res, Index resIncr, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs, lhsStride); + RhsMapper rhs(_rhs, rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; @@ -108,7 +110,7 @@ static void run(Index rows, Index cols, Index depth, // i.e., we test that info[tid].users equals 0. 
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} - info[tid].users += threads; + info[tid].users = threads; pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); @@ -146,7 +148,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i template static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::evalTo(dst, lhs, rhs); + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. + // I'm not sure it is still required. 
+ if((rhs.rows()+dst.rows()+dst.cols())0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { dst.setZero(); @@ -439,8 +449,8 @@ struct generic_product_impl template static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::addTo(dst, lhs, rhs); + if((rhs.rows()+dst.rows()+dst.cols())0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -448,8 +458,8 @@ struct generic_product_impl template static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::subTo(dst, lhs, rhs); + if((rhs.rows()+dst.rows()+dst.cols())0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } @@ -461,11 +471,25 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + if (dst.cols() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor,LhsScalar,RhsScalar, Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; @@ -476,7 +500,8 @@ struct generic_product_impl Index, LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); diff --git a/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7122efa6..6ba0d9bd 100644 --- a/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -25,51 +25,54 @@ namespace internal { **********************************************************************/ // forward declarations (defined at the end of this file) -template +template struct tribb_kernel; /* Optimized matrix-matrix product evaluating only one triangular half */ template + int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized> struct general_matrix_matrix_triangular_product; // as usual if the result is row major => we transpose the product template -struct general_matrix_matrix_triangular_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const 
RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); + ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower> + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking); } }; template -struct general_matrix_matrix_triangular_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -84,10 +87,10 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; - tribb_kernel sybb; + tribb_kernel sybb; for(Index k2=0; k2 +template struct tribb_kernel { typedef gebp_traits Traits; @@ -142,11 +144,13 @@ struct tribb_kernel enum { BlockSize = meta_least_common_multiple::ret }; - void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, 
const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - typedef blas_data_mapper ResMapper; - ResMapper res(_res, resStride); - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + typedef blas_data_mapper BufferMapper; + ResMapper res(_res, resStride, resIncr); + gebp_kernel gebp_kernel1; + gebp_kernel gebp_kernel2; Matrix buffer((internal::constructor_without_unaligned_array_assert())); @@ -158,31 +162,32 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0); - + gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0); + gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + // 2 - triangular accumulation for(Index j1=0; j1 enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, - RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0, + SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0 }; Index size = mat.cols(); + if(SkipDiag) + size--; Index depth = actualLhs.cols(); typedef internal::gemm_blocking_space internal::general_matrix_matrix_triangular_product + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)> ::run(size, depth, - &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 
1 : 0,0), actualLhs.outerStride(), + &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(), + mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0), + mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { + EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 5b7c15cc..9a650ec2 100644 --- a/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/externals/eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -37,10 +37,10 @@ namespace Eigen { namespace internal { -template +template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< - Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; + Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {}; // try to go to BLAS specialization @@ -48,19 +48,19 @@ struct general_matrix_matrix_rankupdate : template \ struct general_matrix_matrix_triangular_product { \ + Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index 
lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking& blocking) \ { \ - if (lhs==rhs) { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ general_matrix_matrix_rankupdate \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ general_matrix_matrix_triangular_product \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ + ColMajor, 1, UpLo, BuiltIn> \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \ } \ } \ }; @@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ EIGTYPE beta(1); \ - BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ } \ }; @@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate \ -struct general_matrix_matrix_product \ +struct general_matrix_matrix_product \ { \ typedef gebp_traits Traits; \ \ static void run(Index rows, Index cols, Index depth, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, \ level3_blocking& /*blocking*/, \ GemmParallelInfo* /*info = 0*/) \ { \ using std::conj; \ \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char transa, transb; \ BlasIndex m, n, k, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -100,13 +102,20 @@ static void run(Index rows, Index cols, Index depth, \ ldb = convert_index(b_tmp.outerStride()); 
\ } else b = _rhs; \ \ - BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif } // end namespase internal diff --git a/externals/eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/externals/eigen/Eigen/src/Core/products/GeneralMatrixVector.h index 3c1a7fc4..dfb6aebc 100644 --- a/externals/eigen/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/externals/eigen/Eigen/src/Core/products/GeneralMatrixVector.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2009 Gael Guennebaud +// Copyright (C) 2008-2016 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. 
If a copy of the MPL was not distributed @@ -14,11 +14,57 @@ namespace Eigen { namespace internal { +enum GEMVPacketSizeType { + GEMVPacketFull = 0, + GEMVPacketHalf, + GEMVPacketQuarter +}; + +template +struct gemv_packet_cond { typedef T3 type; }; + +template +struct gemv_packet_cond { typedef T1 type; }; + +template +struct gemv_packet_cond { typedef T2 type; }; + +template +class gemv_traits +{ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename gemv_packet_cond::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type \ + prefix ## name ## Packet + + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); +#undef PACKET_DECL_COND_PREFIX + +public: + enum { + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && + unpacket_traits<_RhsPacket>::vectorizable && + int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size), + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1 + }; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; +}; + + /* Optimized col-major matrix * vector product: - * This algorithm processes 4 columns at onces that allows to both reduce - * the number of load/stores of the result by a factor 4 and to reduce - * the instruction dependency. Moreover, we know that all bands have the - * same alignment pattern. + * This algorithm processes the matrix per vertical panels, + * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments. 
* * Mixing type logic: C += alpha * A * B * | A | B |alpha| comments @@ -27,56 +73,30 @@ namespace internal { * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp * |cplx |real |real | optimal case, vectorization possible via real-cplx mul * - * Accesses to the matrix coefficients follow the following logic: - * - * - if all columns have the same alignment then - * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case) - * - otherwise perform unaligned loads only (-> NoneAligned case) - * - otherwise - * - if even columns have the same alignment then - * // odd columns are guaranteed to have the same alignment too - * - if even or odd columns have the same alignment as the result, then - * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double) - * - perform half aligned and half unaligned loads (-> EvenAligned case) - * - otherwise perform unaligned loads only (-> NoneAligned case) - * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then - * - one over 4 consecutive columns is guaranteed to be aligned with the result vector, - * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case) - * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h - * - otherwise, - * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats), - * // we currently fall back to the NoneAligned case - * * The same reasoning apply for the transposed case. - * - * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... - * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment - * strategy as in the FirstAligned case. 
The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow - * compared to unaligned loads on a 4 byte boundary. - * */ template struct general_matrix_vector_product { + typedef gemv_traits Traits; + typedef gemv_traits HalfTraits; + typedef gemv_traits QuarterTraits; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? packet_traits::size : 1 -}; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { 
EIGEN_UNUSED_VARIABLE(resIncr); eigen_internal_assert(resIncr==1); - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ - pstore(&res[j], \ - padd(pload(&res[j]), \ - padd( \ - padd(pcj.pmul(lhs0.template load(j), ptmp0), \ - pcj.pmul(lhs1.template load(j), ptmp1)), \ - padd(pcj.pmul(lhs2.template load(j), ptmp2), \ - pcj.pmul(lhs3.template load(j), ptmp3)) ))) - - typedef typename LhsMapper::VectorMapper LhsScalars; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); conj_helper cj; conj_helper pcj; - if(ConjugateRhs) - alpha = numext::conj(alpha); - - enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; - const Index columnsAtOnce = 4; - const Index peels = 2; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index ResPacketAlignedMask = ResPacketSize-1; -// const Index PeelAlignedMask = ResPacketSize*peels-1; - const Index size = rows; + conj_helper pcj_half; + conj_helper pcj_quarter; const Index lhsStride = lhs.stride(); - - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type. - Index alignedStart = internal::first_default_aligned(res,size); - Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? 
EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(size); - - // find how many columns do we have to skip to be aligned with the result (if possible) - Index skipColumns = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) ) - { - alignedSize = 0; - alignedStart = 0; - alignmentPattern = NoneAligned; - } - else if(LhsPacketSize > 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - // Currently, it seems to be better to perform unaligned loads anyway - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; + + const Index n8 = rows-8*ResPacketSize+1; + const Index n4 = rows-4*ResPacketSize+1; + const Index n3 = rows-3*ResPacketSize+1; + const Index n2 = rows-2*ResPacketSize+1; + const Index n1 = rows-1*ResPacketSize+1; + const Index n_half = rows-1*ResPacketSizeHalf+1; + const Index n_quarter = rows-1*ResPacketSizeQuarter+1; + + // TODO: improve the following heuristic: + const Index block_cols = cols<128 ? 
cols : (lhsStride*sizeof(LhsScalar)<32000?16:4); + ResPacket palpha = pset1(alpha); + ResPacketHalf palpha_half = pset1(alpha); + ResPacketQuarter palpha_quarter = pset1(alpha); + + for(Index j2=0; j2(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); + c4 = pcj.pmadd(lhs.template load(i+LhsPacketSize*4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+LhsPacketSize*5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+LhsPacketSize*6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+LhsPacketSize*7,j),b0,c7); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); + pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu(res+i+ResPacketSize*4))); + pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu(res+i+ResPacketSize*5))); + pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu(res+i+ResPacketSize*6))); + pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu(res+i+ResPacketSize*7))); } - else + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); - /* eigen_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= cols) - || LhsPacketSize > size - || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ - } - else if(Vectorizable) - { - alignedStart = 0; - 
alignedSize = size; - alignmentPattern = AllAligned; - } - - const Index offset1 = (FirstAligned && alignmentStep==1)?3:1; - const Index offset3 = (FirstAligned && alignmentStep==1)?1:3; + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); - Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; - for (Index i=skipColumns; i(alpha*rhs(i, 0)), - ptmp1 = pset1(alpha*rhs(i+offset1, 0)), - ptmp2 = pset1(alpha*rhs(i+2, 0)), - ptmp3 = pset1(alpha*rhs(i+offset3, 0)); + i+=ResPacketSize*4; + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)); - // this helps a lot generating better binary code - const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), - lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); + } + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); - if (Vectorizable) + i+=ResPacketSize*3; + } + if(i(ResScalar(0)), + c1 = pset1(ResScalar(0)); + + for(Index j=j2; j(rhs(j,0)); + c0 
= pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); } - - if (alignedSize>alignedStart) + pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); + pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); + i+=ResPacketSize*2; + } + if(i(ResScalar(0)); + for(Index j=j2; j1) - { - LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - ResPacket T0, T1; - - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - A00 = lhs0.template load(j); - A10 = lhs0.template load(j+LhsPacketSize); - T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); - T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); - - T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - T0 = pcj.pmadd(A03, ptmp3, T0); - pstore(&res[j],T0); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - T1 = pcj.pmadd(A11, ptmp1, T1); - T1 = pcj.pmadd(A12, ptmp2, T1); - T1 = pcj.pmadd(A13, ptmp3, T1); - pstore(&res[j+ResPacketSize],T1); - } - } - for (; j(rhs(j,0)); + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); } - } // end explicit vectorization - - /* process remaining coeffs (or all if there is no explicit vectorization) */ - for (Index j=alignedSize; j(res+i+ResPacketSize*0))); + i+=ResPacketSize; + } + if(HasHalf && i(ResScalar(0)); + for(Index j=j2; j(rhs(j,0)); + c0 = pcj_half.pmadd(lhs.template load(i+0,j),b0,c0); + } + pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu(res+i+ResPacketSizeHalf*0))); + i+=ResPacketSizeHalf; } - } - - // process remaining 
first and last columns (at most columnsAtOnce-1) - Index end = cols; - Index start = columnBound; - do - { - for (Index k=start; k(alpha*rhs(k, 0)); - const LhsScalars lhs0 = lhs.getVectorMapper(0, k); - - if (Vectorizable) + ResPacketQuarter c0 = pset1(ResScalar(0)); + for(Index j=j2; j(alignedStart)) - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); - else - for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); + RhsPacketQuarter b0 = pset1(rhs(j,0)); + c0 = pcj_quarter.pmadd(lhs.template load(i+0,j),b0,c0); } - - // process remaining scalars (or all if no explicit vectorization) - for (Index i=alignedSize; i(res+i+ResPacketSizeQuarter*0))); + i+=ResPacketSizeQuarter; } - if (skipColumns) + for(;i struct general_matrix_vector_product { -typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - -enum { - Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable - && int(packet_traits::size)==int(packet_traits::size), - LhsPacketSize = Vectorizable ? packet_traits::size : 1, - RhsPacketSize = Vectorizable ? packet_traits::size : 1, - ResPacketSize = Vectorizable ? 
packet_traits::size : 1 -}; + typedef gemv_traits Traits; + typedef gemv_traits HalfTraits; + typedef gemv_traits QuarterTraits; + + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; -typedef typename packet_traits::type _LhsPacket; -typedef typename packet_traits::type _RhsPacket; -typedef typename packet_traits::type _ResPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; -typedef typename conditional::type LhsPacket; -typedef typename conditional::type RhsPacket; -typedef typename conditional::type ResPacket; + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& lhs, + const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - eigen_internal_assert(rhs.stride()==1); - - #ifdef _EIGEN_ACCUMULATE_PACKETS - #error _EIGEN_ACCUMULATE_PACKETS has already been defined - #endif - - #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ - RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ - ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ - ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ - ptmp3 = pcj.pmadd(lhs3.template load(j), b, 
ptmp3); } + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate propoer code. + LhsMapper lhs(alhs); + eigen_internal_assert(rhs.stride()==1); conj_helper cj; conj_helper pcj; - - typedef typename LhsMapper::VectorMapper LhsScalars; - - enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; - const Index rowsAtOnce = 4; - const Index peels = 2; - const Index RhsPacketAlignedMask = RhsPacketSize-1; - const Index LhsPacketAlignedMask = LhsPacketSize-1; - const Index depth = cols; - const Index lhsStride = lhs.stride(); - - // How many coeffs of the result do we have to skip to be aligned. - // Here we assume data are at least aligned on the base scalar type - // if that's not the case then vectorization is discarded, see below. - Index alignedStart = rhs.firstAligned(depth); - Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; - const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - - const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; - Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; - - // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = lhs.firstAligned(depth); - const Index rhsAlignmentOffset = rhs.firstAligned(rows); - - // find how many rows do we have to skip to be aligned with rhs (if possible) - Index skipRows = 0; - // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || - (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || - (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) - { - alignedSize = 0; - alignedStart = 0; - alignmentPattern = NoneAligned; - } - else if(LhsPacketSize > 4) - { - // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. - alignmentPattern = NoneAligned; - } - else if (LhsPacketSize>1) + conj_helper pcj_half; + conj_helper pcj_quarter; + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. + const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; + const Index n4 = rows-3; + const Index n2 = rows-1; + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + ResPacketSizeHalf = HalfTraits::ResPacketSize, + ResPacketSizeQuarter = QuarterTraits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + LhsPacketSizeHalf = HalfTraits::LhsPacketSize, + LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize, + HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, + HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf + }; + + Index i=0; + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)), + c4 = pset1(ResScalar(0)), + c5 = pset1(ResScalar(0)), + c6 = pset1(ResScalar(0)), + c7 = pset1(ResScalar(0)); + + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) { - // nothing can be aligned, no need to skip any column - alignmentPattern = NoneAligned; - skipRows = 0; + RhsPacket b0 = rhs.template load(j,0); + + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + c4 = 
pcj.pmadd(lhs.template load(i+4,j),b0,c4); + c5 = pcj.pmadd(lhs.template load(i+5,j),b0,c5); + c6 = pcj.pmadd(lhs.template load(i+6,j),b0,c6); + c7 = pcj.pmadd(lhs.template load(i+7,j),b0,c7); } - else + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + ResScalar cc4 = predux(c4); + ResScalar cc5 = predux(c5); + ResScalar cc6 = predux(c6); + ResScalar cc7 = predux(c7); + for(; j= rows) - || LhsPacketSize > depth - || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ + res[(i+0)*resIncr] += alpha*cc0; + res[(i+1)*resIncr] += alpha*cc1; + res[(i+2)*resIncr] += alpha*cc2; + res[(i+3)*resIncr] += alpha*cc3; + res[(i+4)*resIncr] += alpha*cc4; + res[(i+5)*resIncr] += alpha*cc5; + res[(i+6)*resIncr] += alpha*cc6; + res[(i+7)*resIncr] += alpha*cc7; } - else if(Vectorizable) + for(; i(ResScalar(0)), + c1 = pset1(ResScalar(0)), + c2 = pset1(ResScalar(0)), + c3 = pset1(ResScalar(0)); - Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; - for (Index i=skipRows; i(j,0); - if (Vectorizable) + c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); + c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + ResScalar cc2 = predux(c2); + ResScalar cc3 = predux(c3); + for(; j(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), - ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + RhsScalar b0 = rhs(j,0); - // process initial unaligned coeffs - // FIXME this loop get vectorized by the compiler ! - for (Index j=0; j(ResScalar(0)), + c1 = pset1(ResScalar(0)); - if (alignedSize>alignedStart) - { - switch(alignmentPattern) - { - case AllAligned: - for (Index j = alignedStart; j1) - { - /* Here we proccess 4 rows with with two peeled iterations to hide - * the overhead of unaligned loads. 
Moreover unaligned loads are handled - * using special shift/move operations between the two aligned packets - * overlaping the desired unaligned packet. This is *much* more efficient - * than basic unaligned loads. - */ - LhsPacket A01, A02, A03, A11, A12, A13; - A01 = lhs1.template load(alignedStart-1); - A02 = lhs2.template load(alignedStart-2); - A03 = lhs3.template load(alignedStart-3); - - for (; j(0); - A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); - A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); - A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); - - ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); - ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); - ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); - ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); - - b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); - ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); - ptmp1 = pcj.pmadd(A11, b, ptmp1); - ptmp2 = pcj.pmadd(A12, b, ptmp2); - ptmp3 = pcj.pmadd(A13, b, ptmp3); - } - } - for (; j(j,0); - // process remaining coeffs (or all if no explicit vectorization) - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j(i+0,j),b0,c0); + c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + } + ResScalar cc0 = predux(c0); + ResScalar cc1 = predux(c1); + for(; j(ResScalar(0)); + ResPacketHalf c0_h = pset1(ResScalar(0)); + ResPacketQuarter c0_q = pset1(ResScalar(0)); + Index j=0; + for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) { - EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0); - ResPacket ptmp0 = pset1(tmp0); - const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); - // process first unaligned result's coeffs - // FIXME this loop get vectorized by the compiler ! 
- for (Index j=0; jalignedStart) - { - // process aligned rhs coeffs - if (lhs0.template aligned(alignedStart)) - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - else - for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); - tmp0 += predux(ptmp0); - } - - // process remaining scalars - // FIXME this loop get vectorized by the compiler ! - for (Index j=alignedSize; j(j,0); + c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } - if (skipRows) + ResScalar cc0 = predux(c0); + if (HasHalf) { + for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + { + RhsPacketHalf b0 = rhs.template load(j,0); + c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); + } + cc0 += predux(c0_h); + } + if (HasQuarter) { + for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter) + { + RhsPacketQuarter b0 = rhs.template load(j,0); + c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); + } + cc0 += predux(c0_q); + } + for(; j \ struct general_matrix_vector_product_gemv \ { \ @@ -113,14 +113,21 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) -EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) -EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else 
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif } // end namespase internal diff --git a/externals/eigen/Eigen/src/Core/products/Parallelizer.h b/externals/eigen/Eigen/src/Core/products/Parallelizer.h index c2f084c8..8f91879e 100644 --- a/externals/eigen/Eigen/src/Core/products/Parallelizer.h +++ b/externals/eigen/Eigen/src/Core/products/Parallelizer.h @@ -10,6 +10,10 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H +#if EIGEN_HAS_CXX11_ATOMIC +#include +#endif + namespace Eigen { namespace internal { @@ -17,7 +21,8 @@ namespace internal { /** \internal */ inline void manage_multi_threading(Action action, int* v) { - static EIGEN_UNUSED int m_maxThreads = -1; + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads) if(action==SetAction) { @@ -75,8 +80,17 @@ template struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} + // volatile is not enough on all architectures (see bug 1572) + // to guarantee that when thread A says to thread B that it is + // done with packing a block, then all writes have been really + // carried out... C++11 memory model+atomic guarantees this. +#if EIGEN_HAS_CXX11_ATOMIC + std::atomic sync; + std::atomic users; +#else Index volatile sync; int volatile users; +#endif Index lhs_start; Index lhs_length; @@ -87,11 +101,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, { // TODO when EIGEN_USE_BLAS is defined, // we should still enable OMP for other scalar types -#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS) + // Without C++11, we have to disable GEMM's parallelization on + // non x86 architectures because there volatile is not enough for our purpose. + // See bug 1572. +#if (! 
defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) // FIXME the transpose variable is only needed to properly split // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices. This whole - // parallelizer mechanism has to be redisigned anyway. + // parallelizer mechanism has to be redesigned anyway. EIGEN_UNUSED_VARIABLE(depth); EIGEN_UNUSED_VARIABLE(transpose); func(0,rows, 0,cols); @@ -112,12 +129,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, double work = static_cast(rows) * static_cast(cols) * static_cast(depth); double kMinTaskSize = 50000; // FIXME improve this heuristic. - pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); + pb_max_threads = std::max(1, std::min(pb_max_threads, static_cast( work / kMinTaskSize ) )); // compute the number of threads we are going to use Index threads = std::min(nbThreads(), pb_max_threads); - // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? 
if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) diff --git a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index da6f82ab..33ecf10f 100644 --- a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,14 +45,23 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - enum { PacketSize = packet_traits::size }; + typedef typename unpacket_traits::type>::half HalfPacket; + typedef typename unpacket_traits::type>::half>::half QuarterPacket; + enum { PacketSize = packet_traits::size, + HalfPacketSize = unpacket_traits::size, + QuarterPacketSize = unpacket_traits::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? 
peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); + if(HasHalf && Pack1>=HalfPacketSize) + for(Index i=peeled_mc1; i(blockA, lhs, cols, i, count); + + if(HasQuarter && Pack1>=QuarterPacketSize) + for(Index i=peeled_mc_half; i(blockA, lhs, cols, i, count); + // do the same with mr==1 - for(Index i=peeled_mc1; i + int ResStorageOrder, int ResInnerStride> struct product_selfadjoint_matrix; template -struct product_selfadjoint_matrix + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? 
ColMajor : RowMajor, LhsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), - ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor,ResInnerStride> + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } }; template -struct product_selfadjoint_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template -EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -334,11 +354,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef const_blas_data_mapper LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); LhsTransposeMapper lhs_transpose(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -352,7 +372,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + 
gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -398,26 +418,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix -struct product_selfadjoint_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template -EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -425,9 +447,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Traits; typedef const_blas_data_mapper LhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride); + ResMapper res(_res,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -437,7 +459,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits::Flags &RowMajorBit) ? 
RowMajor : ColMajor, RhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), - internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> + internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime> ::run( lhs.rows(), rhs.cols(), // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info actualAlpha, blocking // alpha ); } diff --git a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index a45238d6..61396dbd 100644 --- a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -40,20 +40,22 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='L', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -81,25 +83,27 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, 
(const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='L', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -144,33 +148,41 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, 
BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -197,25 +209,27 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -259,15 +273,21 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, 
&n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h index 3fd180e6..d38fd72b 100644 --- a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: - * This algorithm processes 2 columns at onces that allows to both reduce + * This algorithm processes 2 columns at once that allows to both reduce * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. 
*/ @@ -27,7 +27,8 @@ template -EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product enum { LhsUpLo = LhsMode&(Upper|Lower) }; template - static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 38f23acc..1238345e 100644 --- a/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/externals/eigen/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif } // end namespace 
internal diff --git a/externals/eigen/Eigen/src/Core/products/SelfadjointProduct.h b/externals/eigen/Eigen/src/Core/products/SelfadjointProduct.h index f038d686..a21be805 100644 --- a/externals/eigen/Eigen/src/Core/products/SelfadjointProduct.h +++ b/externals/eigen/Eigen/src/Core/products/SelfadjointProduct.h @@ -109,10 +109,10 @@ struct selfadjoint_product_selector internal::general_matrix_matrix_triangular_product::IsComplex, Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, - IsRowMajor ? RowMajor : ColMajor, UpLo> + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> ::run(size, depth, - &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(), + mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/externals/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/externals/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae36411..f752a0bf 100644 --- a/externals/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/externals/eigen/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector; template struct selfadjoint_rank2_update_selector { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index 
size = u.size(); for (Index i=0; i struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; @@ -79,8 +80,8 @@ ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const if (IsRowMajor) actualAlpha = numext::conj(actualAlpha); - typedef typename internal::remove_all::type>::type UType; - typedef typename internal::remove_all::type>::type VType; + typedef typename internal::remove_all::type>::type UType; + typedef typename internal::remove_all::type>::type VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h index 6ec5a8a0..f0c60507 100644 --- a/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -45,22 +45,24 @@ template + int ResStorageOrder, int ResInnerStride, + int Version = Specialized> struct product_triangular_matrix_matrix; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version> { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_triangular_matrix_matrix - ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor, ResInnerStride> + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, 
blocking); } }; // implements col-major += alpha * op(triangular) * op(general) template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits Traits; @@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix& blocking); }; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -137,7 +141,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. 
+ internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -145,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; @@ -216,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -229,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits Traits; enum { @@ -245,20 +256,21 @@ struct product_triangular_matrix_matrix& blocking); }; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -270,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M 
direction @@ -284,7 +296,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -292,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; @@ -393,7 +406,9 @@ struct triangular_product_impl { template static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typedef typename Dest::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -405,8 +420,9 @@ struct triangular_product_impl typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -423,14 +439,29 @@ struct triangular_product_impl Mode, LhsIsTriangular, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits::Flags&RowMajorBit) ? 
RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, - (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor> + (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime> ::run( stripedRows, stripedCols, stripedDepth, // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info actualAlpha, blocking ); + + // Apply correction if the diagonal is unit and a scalar factor was nested: + if ((Mode&UnitDiag)==UnitDiag) + { + if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); + } + else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + { + Index diagSize = (std::min)(rhs.rows(),rhs.cols()); + dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); + } + } } }; diff --git a/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index aecded6b..a98d12e4 100644 --- a/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/externals/eigen/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -46,7 +46,7 @@ template {}; + RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {}; // try to go to BLAS specialization @@ -55,13 +55,15 @@ template \ struct product_triangular_matrix_matrix { \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ - const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) { \ + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, 
level3_blocking& blocking) { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ } \ }; @@ -75,7 +77,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm(); \ BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ } \ @@ -172,7 +174,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +182,20 @@ struct product_triangular_matrix_matrix_trmm \ @@ -225,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -234,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm(); \ BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ } \ @@ -282,7 +291,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +299,17 @@ struct product_triangular_matrix_matrix_trmm struct trmv_selector typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(lhs); typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 @@ -274,6 +275,12 @@ template struct trmv_selector else dest = MappedDest(actualDestPtr, dest.size()); } + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; @@ -295,8 +302,9 @@ template struct trmv_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 @@ -326,6 +334,12 @@ template struct trmv_selector actualRhsPtr,1, dest.data(),dest.innerStride(), actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; 
diff --git a/externals/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/externals/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 07bf26ce..3d47a2b9 100644 --- a/externals/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/externals/eigen/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, 
BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_RM(double, double, d, d) -EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_RM(float, float, f, s) -EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_RM(double, double, d, d,) +EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_RM(float, float, f, s,) +EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_RM(double, double, d, d,_) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_) +EIGEN_BLAS_TRMV_RM(float, float, f, s,_) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_) +#endif } // end namespase internal diff --git a/externals/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/externals/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h index 223c38b8..6d879ba0 100644 --- a/externals/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/externals/eigen/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -15,48 +15,48 @@ namespace Eigen { namespace internal { // if the rhs is row major, let's transpose the product -template -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static void run( Index size, Index cols, const Scalar* tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { triangular_solve_matrix< Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft, 
(Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper), NumTraits::IsComplex && Conjugate, - TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor> - ::run(size, cols, tri, triStride, _other, otherStride, blocking); + TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride> + ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking); } }; /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left */ -template -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking); }; -template -EIGEN_DONT_INLINE void triangular_solve_matrix::run( +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { Index cols = otherSize; typedef const_blas_data_mapper TriMapper; - typedef blas_data_mapper OtherMapper; + typedef blas_data_mapper OtherMapper; TriMapper tri(_tri, triStride); - OtherMapper other(_other, otherStride); + OtherMapper other(_other, otherStride, otherIncr); typedef gebp_traits Traits; @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -128,19 +128,21 @@ EIGEN_DONT_INLINE void triangular_solve_matrix -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, 
level3_blocking& blocking); }; -template -EIGEN_DONT_INLINE void triangular_solve_matrix::run( +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { Index rows = otherSize; typedef typename NumTraits::Real RealScalar; - typedef blas_data_mapper LhsMapper; + typedef blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - LhsMapper lhs(_other, otherStride); + LhsMapper lhs(_other, otherStride, otherIncr); RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; @@ -229,7 +231,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2 \ -struct triangular_solve_matrix \ +struct triangular_solve_matrix \ { \ enum { \ IsLower = (Mode&Lower) == Lower, \ @@ -51,8 +51,10 @@ struct triangular_solve_matrix& /*blocking*/) \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ BlasIndex m = convert_index(size), n = convert_index(otherSize), lda, ldb; \ char side = 'L', uplo, diag='N', transa; \ /* Set alpha_ */ \ @@ -80,20 +82,26 @@ struct triangular_solve_matrix \ -struct triangular_solve_matrix \ +struct triangular_solve_matrix \ { \ enum { \ IsLower = (Mode&Lower) == Lower, \ @@ -104,8 +112,10 @@ struct triangular_solve_matrix& /*blocking*/) \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ BlasIndex m = convert_index(otherSize), n = convert_index(size), lda, ldb; \ char side = 'R', uplo, diag='N', transa; \ /* Set alpha_ */ \ @@ -133,16 +143,22 @@ struct 
triangular_solve_matrix0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if(!(Mode & UnitDiag)) + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) rhs[i] /= cjLhs(i,i); } } @@ -114,20 +114,23 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + if(numext::not_equal_strict(rhs[i],RhsScalar(0))) + { + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs.coeff(i,i); + + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), diff --git a/externals/eigen/Eigen/src/Core/util/BlasUtil.h b/externals/eigen/Eigen/src/Core/util/BlasUtil.h index 6e6ee119..e16a5649 100644 --- a/externals/eigen/Eigen/src/Core/util/BlasUtil.h +++ b/externals/eigen/Eigen/src/Core/util/BlasUtil.h @@ -24,14 +24,14 @@ struct gebp_kernel; template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, - int ResStorageOrder> + int ResStorageOrder, int ResInnerStride> struct general_matrix_matrix_product; template struct general_matrix_vector_product; - -template struct conj_if; - -template<> struct conj_if { - template - inline T operator()(const T& x) const { return numext::conj(x); } - template - inline T pconj(const T& x) const { return internal::pconj(x); } -}; - -template<> struct conj_if { - template - inline const T& operator()(const T& x) const { return x; } - template - inline 
const T& pconj(const T& x) const { return x; } -}; - -// Generic implementation for custom complex types. -template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType Scalar; - - EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const - { return conj_if()(x) * conj_if()(y); } -}; - -template struct conj_helper -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } -}; - -template struct conj_helper, std::complex, false,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } -}; - -template struct conj_helper, std::complex, true,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, std::complex, true,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) - 
numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, RealScalar, Conj,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if()(x)*y; } -}; - -template struct conj_helper, false,Conj> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x*conj_if()(y); } -}; - template struct get_factor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } }; @@ -155,13 +71,19 @@ class BlasVectorMapper { Scalar* m_data; }; -template -class BlasLinearMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; +template +class BlasLinearMapper; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} +template +class BlasLinearMapper +{ +public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1) + : m_data(data) + { + EIGEN_ONLY_USED_FOR_DEBUG(incr); + eigen_assert(incr==1); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -171,33 +93,86 @@ class BlasLinearMapper { return m_data[i]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return ploadt(m_data + i); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return ploadt(m_data + i); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index 
i, const Packet &p) const { - pstoret(m_data + i, p); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pstoret(m_data + i, p); } - protected: +protected: Scalar *m_data; }; // Lightweight helper class to access matrix coefficients. -template -class blas_data_mapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; +template +class blas_data_mapper; + +// TMP to help PacketBlock store implementation. +// There's currently no known use case for PacketBlock load. +// The default implementation assumes ColMajor order. +// It always store each packet sequentially one `stride` apart. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + i + (j + idx)*stride, block.packet[idx]); + } +}; + +// PacketBlockManagement specialization to take care of RowMajor order without ifs. 
+template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + j + (i + idx)*stride, block.packet[idx]); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; +template +class blas_data_mapper +{ +public: typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1) + : m_data(data), m_stride(stride) + { + EIGEN_ONLY_USED_FOR_DEBUG(incr); + eigen_assert(incr==1); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper getSubMapper(Index i, Index j) const { @@ -218,12 +193,14 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { + return ploadt(&operator()(i, j)); } template @@ -246,11 +223,167 @@ class blas_data_mapper { return internal::first_default_aligned(m_data, size); } - protected: + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock &block) const { + PacketBlockManagement pbm; + pbm.store(m_data, m_stride, i, j, block); + } +protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; }; +// Implementation of non-natural increment (i.e. inner-stride != 1) +// The exposed API is not complete yet compared to the Incr==1 case +// because some features makes less sense in this case. 
+template +class BlasLinearMapper +{ +public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i*m_incr.value()]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return pgather(m_data + i*m_incr.value(), m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pscatter(m_data + i*m_incr.value(), p, m_incr.value()); + } + +protected: + Scalar *m_data; + const internal::variable_if_dynamic m_incr; +}; + +template +class blas_data_mapper +{ +public: + typedef BlasLinearMapper LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value()); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j), m_incr.value()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? 
j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + + // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the Complex types. + template + struct storePacketBlock_helper + { + storePacketBlock_helper spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup, i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + ScalarT *v = &sup->operator()(i+l, j+idx); + *v = block.packet[idx][l]; + } + } + }; + + template + struct storePacketBlock_helper, n, idx> + { + storePacketBlock_helper, n, idx-1> spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup,i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + std::complex *v = &sup->operator()(i+l, j+idx); + v->real(block.packet[idx].v[2*l+0]); + v->imag(block.packet[idx].v[2*l+1]); + } + } + }; + + template + struct storePacketBlock_helper, n, idx> + { + storePacketBlock_helper, n, idx-1> spbh; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper* sup, Index i, Index j, const PacketBlock& block) const { + spbh.store(sup,i,j,block); + for(int l = 0; l < unpacket_traits::size; l++) + { + std::complex 
*v = &sup->operator()(i+l, j+idx); + v->real(block.packet[idx].v[2*l+0]); + v->imag(block.packet[idx].v[2*l+1]); + } + } + }; + + template + struct storePacketBlock_helper + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + + template + struct storePacketBlock_helper, n, -1> + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + + template + struct storePacketBlock_helper, n, -1> + { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(const blas_data_mapper*, Index, Index, const PacketBlock& ) const { + } + }; + // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be avoided when possible. + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock&block) const { + storePacketBlock_helper spb; + spb.store(this, i,j,block); + } +protected: + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; + const internal::variable_if_dynamic m_incr; +}; + // lightweight helper class to access matrix coefficients (const version) template class const_blas_data_mapper : public blas_data_mapper { @@ -278,14 +411,15 @@ template struct blas_traits HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit) && ( bool(XprType::IsVectorAtCompileTime) || int(inner_stride_at_compile_time::ret) == 1) - ) ? 1 : 0 + ) ? 
1 : 0, + HasScalarFactor = false }; typedef typename conditional::type DirectLinearAccessType; - static inline ExtractType extract(const XprType& x) { return x; } - static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } + static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -310,17 +444,23 @@ template struct blas_traits, const CwiseNullaryOp,Plain>, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template struct blas_traits, NestedXpr, const CwiseNullaryOp,Plain> > > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseBinaryOp, NestedXpr, const CwiseNullaryOp,Plain> > XprType; typedef typename Base::ExtractType ExtractType; @@ -339,6 +479,9 @@ template struct blas_traits, NestedXpr> > : blas_traits { + enum { + HasScalarFactor = true + }; typedef blas_traits Base; typedef CwiseUnaryOp, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; @@ -375,7 +518,7 @@ struct blas_traits template::HasUsableDirectAccess> struct extract_data_selector { - static const typename T::Scalar* run(const T& m) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) { return blas_traits::extract(m).data(); } @@ -386,11 +529,53 @@ struct 
extract_data_selector { static typename T::Scalar* run(const T&) { return 0; } }; -template const typename T::Scalar* extract_data(const T& m) +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) { return extract_data_selector::run(m); } +/** + * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products. + * There is a specialization for booleans + */ +template +struct combine_scalar_factors_impl +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs) + { + return blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) + { + return alpha * blas_traits::extractScalarFactor(lhs) * blas_traits::extractScalarFactor(rhs); + } +}; +template +struct combine_scalar_factors_impl +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs) + { + return blas_traits::extractScalarFactor(lhs) && blas_traits::extractScalarFactor(rhs); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs) + { + return alpha && blas_traits::extractScalarFactor(lhs) && blas_traits::extractScalarFactor(rhs); + } +}; + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) +{ + return combine_scalar_factors_impl::run(alpha, lhs, rhs); +} +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs) +{ + return combine_scalar_factors_impl::run(lhs, rhs); +} + + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/util/ConfigureVectorization.h b/externals/eigen/Eigen/src/Core/util/ConfigureVectorization.h new file mode 100644 index 00000000..af4e6962 --- /dev/null +++ 
b/externals/eigen/Eigen/src/Core/util/ConfigureVectorization.h @@ -0,0 +1,512 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2018 Gael Guennebaud +// Copyright (C) 2020, Arm Limited and Contributors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONFIGURE_VECTORIZATION_H +#define EIGEN_CONFIGURE_VECTORIZATION_H + +//------------------------------------------------------------------------------------------ +// Static and dynamic alignment control +// +// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES +// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. +// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, +// a default value is automatically computed based on architecture, compiler, and OS. +// +// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} +// to be used to declare statically aligned buffers. +//------------------------------------------------------------------------------------------ + + +/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. + * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, + * so that vectorization doesn't affect binary compatibility. + * + * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link + * vectorized and non-vectorized code. + * + * FIXME: this code can be cleaned up once we switch to proper C++11 only. 
+ */ +#if (defined EIGEN_CUDACC) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_HAS_ALIGNAS + #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) + #define EIGEN_ALIGNOF(x) alignof(x) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_MSVC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_SUNCC + // FIXME not sure about this one: + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#else + #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler +#endif + +// If the user explicitly disable vectorization, then we also disable alignment +#if defined(EIGEN_DONT_VECTORIZE) + #if defined(EIGEN_GPUCC) + // GPU code is always vectorized and requires memory alignment for + // statically allocated buffers. + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 + #else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 + #endif +#elif defined(__AVX512F__) + // 64 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 +#elif defined(__AVX__) + // 32 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 +#endif + + +// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense +#define EIGEN_MIN_ALIGN_BYTES 16 + +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. 
+ +#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. +#endif + +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated +// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 +#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) + #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES + #undef EIGEN_MAX_STATIC_ALIGN_BYTES + #endif + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 +#endif + +#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES + + // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES + + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable + // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always + // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in + // certain common platform (compiler+architecture combinations) to avoid these problems. + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. + // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. + // 4.8 and newer seem definitely unaffected. 
+ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #else + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + #endif + + // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX + #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !EIGEN_COMP_SUNCC \ + && !EIGEN_OS_QNX + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 + #else + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 + #endif + + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT + #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES + #else + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + #endif + +#endif + +// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES +#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. +// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) +// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). +// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. + + +// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY +#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) +#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#else +#define EIGEN_ALIGN_MAX +#endif + + +// Dynamic alignment control + +#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 +#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. 
+#endif + +#ifdef EIGEN_DONT_ALIGN + #ifdef EIGEN_MAX_ALIGN_BYTES + #undef EIGEN_MAX_ALIGN_BYTES + #endif + #define EIGEN_MAX_ALIGN_BYTES 0 +#elif !defined(EIGEN_MAX_ALIGN_BYTES) + #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#endif + +#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#else +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES +#endif + + +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + +//---------------------------------------------------------------------- + +// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into +// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks +#if EIGEN_MAX_ALIGN_BYTES==0 + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + + +// The following (except #include and _M_IX86_FP ??) can likely be +// removed as gcc 4.1 and msvc 2008 are not supported anyways. +#if EIGEN_COMP_MSVC + #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled + #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later + // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. + #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 + #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER + #endif + #endif +#else + #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) + #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC + #endif +#endif + +#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC)) + + #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) + + // Defines symbols for compile-time detection of which instructions are + // used. 
+ // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SSE + #define EIGEN_VECTORIZE_SSE2 + + // Detect sse3/ssse3/sse4: + // gcc and icc defines __SSE3__, ... + // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you + // want to force the use of those instructions with msvc. + #ifdef __SSE3__ + #define EIGEN_VECTORIZE_SSE3 + #endif + #ifdef __SSSE3__ + #define EIGEN_VECTORIZE_SSSE3 + #endif + #ifdef __SSE4_1__ + #define EIGEN_VECTORIZE_SSE4_1 + #endif + #ifdef __SSE4_2__ + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX__ + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX2__ + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__)) + // MSVC does not expose a switch dedicated for FMA + // For MSVC, AVX2 => FMA + #define EIGEN_VECTORIZE_FMA + #endif + #if defined(__AVX512F__) + #ifndef EIGEN_VECTORIZE_FMA + #if EIGEN_COMP_GNUC + #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). + #else + #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). 
+ #endif + #endif + #ifndef EIGEN_USE_SYCL + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #endif + #define EIGEN_VECTORIZE_FMA + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #ifndef EIGEN_USE_SYCL + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif + #ifdef __AVX512BF16__ + #define EIGEN_VECTORIZE_AVX512BF16 + #endif + #endif + #endif + + // Disable AVX support on broken xcode versions + #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 ) + // A nasty bug in the clang compiler shipped with xcode in a common compilation situation + // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1 + #ifdef EIGEN_VECTORIZE_AVX + #undef EIGEN_VECTORIZE_AVX + #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. 
" + #ifdef EIGEN_VECTORIZE_AVX2 + #undef EIGEN_VECTORIZE_AVX2 + #endif + #ifdef EIGEN_VECTORIZE_FMA + #undef EIGEN_VECTORIZE_FMA + #endif + #ifdef EIGEN_VECTORIZE_AVX512 + #undef EIGEN_VECTORIZE_AVX512 + #endif + #ifdef EIGEN_VECTORIZE_AVX512DQ + #undef EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef EIGEN_VECTORIZE_AVX512ER + #undef EIGEN_VECTORIZE_AVX512ER + #endif + #endif + // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX + // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests + // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases + // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping tests + // with -macosx-version-min=10.15 and AVX + // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with + // -macosx-version-min=10.15 and AVX + #endif + + // include files + + // This extern "C" works around a MINGW-w64 compilation issue + // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 + // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). + // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations + // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; + // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. + // notice that since these are C headers, the extern "C" is theoretically needed anyways. + extern "C" { + // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. + // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus: + #if EIGEN_COMP_ICC >= 1110 + #include + #else + #include + #include + #include + #ifdef EIGEN_VECTORIZE_SSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_1 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_2 + #include + #endif + #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) + #include + #endif + #endif + } // end extern "C" + + #elif defined __VSX__ + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_VSX + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + + #elif defined __ALTIVEC__ + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ALTIVEC + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + + #elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE) + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include + + // We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and + // will not select the backend automatically + #elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE) + + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SVE + #include + + // Since we depend on knowing SVE vector lengths at compile-time, we need + // to ensure a fixed lengths is set + #if defined __ARM_FEATURE_SVE_BITS + #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS + #else +#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set." +#endif + +#elif (defined __s390x__ && defined __VEC__) + +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_ZVECTOR +#include + +#elif defined __mips_msa + +// Limit MSA optimizations to little-endian CPUs for now. +// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs? 
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#if defined(__LP64__) +#define EIGEN_MIPS_64 +#else +#define EIGEN_MIPS_32 +#endif +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_MSA +#include +#endif + +#endif +#endif + +// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all +// compilers seem to follow this. We therefore include it explicitly. +// See also: https://bugs.llvm.org/show_bug.cgi?id=47955 +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) + #include +#endif + +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) + // We can use the optimized fp16 to float and float to fp16 conversion routines + #define EIGEN_HAS_FP16_C + + #if defined(EIGEN_COMP_CLANG) + // Workaround for clang: The FP16C intrinsics for clang are included by + // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 + #include + #endif +#endif + +#if defined EIGEN_CUDACC + #define EIGEN_VECTORIZE_GPU + #include + #if EIGEN_CUDA_SDK_VER >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #endif +#endif + +#if defined(EIGEN_HAS_CUDA_FP16) + #include + #include +#endif + +#if defined(EIGEN_HIPCC) + #define EIGEN_VECTORIZE_GPU + #include + #define EIGEN_HAS_HIP_FP16 + #include +#endif + + +/** \brief Namespace containing all symbols from the %Eigen library. 
*/ +namespace Eigen { + +inline static const char *SimdInstructionSetsInUse(void) { +#if defined(EIGEN_VECTORIZE_AVX512) + return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_1) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; +#elif defined(EIGEN_VECTORIZE_SSSE3) + return "SSE, SSE2, SSE3, SSSE3"; +#elif defined(EIGEN_VECTORIZE_SSE3) + return "SSE, SSE2, SSE3"; +#elif defined(EIGEN_VECTORIZE_SSE2) + return "SSE, SSE2"; +#elif defined(EIGEN_VECTORIZE_ALTIVEC) + return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_VSX) + return "VSX"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; +#elif defined(EIGEN_VECTORIZE_SVE) + return "ARM SVE"; +#elif defined(EIGEN_VECTORIZE_ZVECTOR) + return "S390X ZVECTOR"; +#elif defined(EIGEN_VECTORIZE_MSA) + return "MIPS MSA"; +#else + return "None"; +#endif +} + +} // end namespace Eigen + + +#endif // EIGEN_CONFIGURE_VECTORIZATION_H diff --git a/externals/eigen/Eigen/src/Core/util/Constants.h b/externals/eigen/Eigen/src/Core/util/Constants.h index 7587d684..35dcaa7b 100644 --- a/externals/eigen/Eigen/src/Core/util/Constants.h +++ b/externals/eigen/Eigen/src/Core/util/Constants.h @@ -3,6 +3,7 @@ // // Copyright (C) 2008-2015 Gael Guennebaud // Copyright (C) 2007-2009 Benoit Jacob +// Copyright (C) 2020, Arm Limited and Contributors // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -25,6 +26,10 @@ const int Dynamic = -1; */ const int DynamicIndex = 0xffffff; +/** This value means that the increment to go from one value to another in a sequence is not constant for each step. + */ +const int UndefinedIncr = 0xfffffe; + /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm(). 
* The value Infinity there means the L-infinity norm. */ @@ -152,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40; /** \deprecated \ingroup flags * * means the first coefficient packet is guaranteed to be aligned. - * An expression cannot has the AlignedBit without the PacketAccessBit flag. + * An expression cannot have the AlignedBit without the PacketAccessBit flag. * In other words, this means we are allow to perform an aligned packet access to the first element regardless * of the expression kind: * \code @@ -250,12 +255,6 @@ enum AlignmentType { #endif }; -/** \ingroup enums - * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */ -// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox -// TODO: find out what to do with that. Adapt the AlignedBox API ? -enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; - /** \ingroup enums * Enum containing possible values for the \p Direction parameter of * Reverse, PartialReduxExpr and VectorwiseOp. */ @@ -330,9 +329,20 @@ enum StorageOptions { * Enum for specifying whether to apply or solve on the left or right. */ enum SideType { /** Apply transformation on the left. */ - OnTheLeft = 1, + OnTheLeft = 1, /** Apply transformation on the right. */ - OnTheRight = 2 + OnTheRight = 2 +}; + +/** \ingroup enums + * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */ +enum NaNPropagationOptions { + /** Implementation defined behavior if NaNs are present. */ + PropagateFast = 0, + /** Always propagate NaNs. */ + PropagateNaN, + /** Always propagate not-NaNs. 
*/ + PropagateNumbers }; /* the following used to be written as: @@ -464,6 +474,8 @@ namespace Architecture AltiVec = 0x2, VSX = 0x3, NEON = 0x4, + MSA = 0x5, + SVE = 0x6, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -472,6 +484,10 @@ namespace Architecture Target = VSX #elif defined EIGEN_VECTORIZE_NEON Target = NEON +#elif defined EIGEN_VECTORIZE_SVE + Target = SVE +#elif defined EIGEN_VECTORIZE_MSA + Target = MSA #else Target = Generic #endif diff --git a/externals/eigen/Eigen/src/Core/util/DisableStupidWarnings.h b/externals/eigen/Eigen/src/Core/util/DisableStupidWarnings.h index 7559e129..fe0cfec0 100644 --- a/externals/eigen/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/externals/eigen/Eigen/src/Core/util/DisableStupidWarnings.h @@ -4,7 +4,6 @@ #ifdef _MSC_VER // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable - // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -20,7 +19,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -42,17 +41,40 @@ #pragma clang diagnostic push #endif #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #if __clang_major__ >= 3 && __clang_minor__ >= 5 + #pragma clang diagnostic ignored "-Wabsolute-value" + #endif + #if __clang_major__ >= 10 + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) 
) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif -#elif defined __GNUC__ && __GNUC__>=6 +#elif defined __GNUC__ - #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS + #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push #endif - #pragma GCC diagnostic ignored "-Wignored-attributes" - + // g++ warns about local variables shadowing member functions, which is too strict + #pragma GCC diagnostic ignored "-Wshadow" + #if __GNUC__ == 4 && __GNUC_MINOR__ < 8 + // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions: + #pragma GCC diagnostic ignored "-Wtype-limits" + #endif + #if __GNUC__>=6 + #pragma GCC diagnostic ignored "-Wignored-attributes" + #endif + #if __GNUC__==7 + // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325 + #pragma GCC diagnostic ignored "-Wattributes" + #endif #endif #if defined __NVCC__ + #pragma diag_suppress boolean_controlling_expr_is_constant // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message @@ -70,6 +92,15 @@ #pragma diag_suppress 2671 #pragma diag_suppress 2735 #pragma diag_suppress 2737 + #pragma diag_suppress 2739 #endif +#else +// warnings already disabled: +# ifndef EIGEN_WARNINGS_DISABLED_2 +# define EIGEN_WARNINGS_DISABLED_2 +# elif defined(EIGEN_INTERNAL_DEBUGGING) +# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!" 
+# endif + #endif // not EIGEN_WARNINGS_DISABLED diff --git a/externals/eigen/Eigen/src/Core/util/ForwardDeclarations.h b/externals/eigen/Eigen/src/Core/util/ForwardDeclarations.h index ea107393..2f9cc449 100644 --- a/externals/eigen/Eigen/src/Core/util/ForwardDeclarations.h +++ b/externals/eigen/Eigen/src/Core/util/ForwardDeclarations.h @@ -47,11 +47,7 @@ template struct NumTraits; template struct EigenBase; template class DenseBase; template class PlainObjectBase; - - -template::value > -class DenseCoeffsBase; +template class DenseCoeffsBase; template class ForceAlignedAccess; template class SwapWrapper; template class Block; +template class IndexedView; +template class Reshaped; template class VectorBlock; template class Transpose; @@ -112,7 +110,7 @@ template class TranspositionsWrapper; template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class MapBase; -template class Stride; +template class Stride; template class InnerStride; template class OuterStride; template > class Map; @@ -133,6 +131,10 @@ template class SolverBase; template class InnerIterator; namespace internal { +template class generic_randaccess_stl_iterator; +template class pointer_based_stl_iterator; +template class subvector_stl_iterator; +template class subvector_stl_reverse_iterator; template struct kernel_retval_base; template struct kernel_retval; template struct image_retval_base; @@ -178,14 +180,15 @@ template struct scalar_sum_op; template struct scalar_difference_op; template struct scalar_conj_product_op; -template struct scalar_min_op; -template struct scalar_max_op; +template struct scalar_min_op; +template struct scalar_max_op; template struct scalar_opposite_op; template struct scalar_conjugate_op; template struct scalar_real_op; template struct scalar_imag_op; template struct scalar_abs_op; template struct scalar_abs2_op; +template struct scalar_absolute_difference_op; template struct scalar_sqrt_op; template struct scalar_rsqrt_op; template struct scalar_exp_op; @@ 
-202,7 +205,7 @@ template struct scalar_cast_op; template struct scalar_random_op; template struct scalar_constant_op; template struct scalar_identity_op; -template struct scalar_sign_op; +template struct scalar_sign_op; template struct scalar_pow_op; template struct scalar_hypot_op; template struct scalar_product_op; @@ -213,11 +216,27 @@ template struct scalar_lgamma_op; template struct scalar_digamma_op; template struct scalar_erf_op; template struct scalar_erfc_op; +template struct scalar_ndtri_op; template struct scalar_igamma_op; template struct scalar_igammac_op; template struct scalar_zeta_op; template struct scalar_betainc_op; +// Bessel functions in SpecialFunctions module +template struct scalar_bessel_i0_op; +template struct scalar_bessel_i0e_op; +template struct scalar_bessel_i1_op; +template struct scalar_bessel_i1e_op; +template struct scalar_bessel_j0_op; +template struct scalar_bessel_y0_op; +template struct scalar_bessel_j1_op; +template struct scalar_bessel_y1_op; +template struct scalar_bessel_k0_op; +template struct scalar_bessel_k0e_op; +template struct scalar_bessel_k1_op; +template struct scalar_bessel_k1e_op; + + } // end namespace internal struct IOFormat; @@ -255,6 +274,7 @@ template class HouseholderQR; template class ColPivHouseholderQR; template class FullPivHouseholderQR; template class CompleteOrthogonalDecomposition; +template class SVDBase; template class JacobiSVD; template class BDCSVD; template class LLT; diff --git a/externals/eigen/Eigen/src/Core/util/IndexedViewHelper.h b/externals/eigen/Eigen/src/Core/util/IndexedViewHelper.h new file mode 100644 index 00000000..f85de305 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/util/IndexedViewHelper.h @@ -0,0 +1,186 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_INDEXED_VIEW_HELPER_H +#define EIGEN_INDEXED_VIEW_HELPER_H + +namespace Eigen { + +namespace internal { +struct symbolic_last_tag {}; +} + +/** \var last + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns + * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * + * A typical usage example would be: + * \code + * using namespace Eigen; + * using Eigen::last; + * VectorXd v(n); + * v(seq(2,last-2)).setOnes(); + * \endcode + * + * \sa end + */ +static const symbolic::SymbolExpr last; // PLEASE use Eigen::last instead of Eigen::placeholders::last + +/** \var lastp1 + * \ingroup Core_Module + * + * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically + * reference the last+1 element/row/columns of the underlying vector or matrix once + * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). + * + * This symbolic placeholder supports standard arithmetic operations. + * It is essentially an alias to last+fix<1>. + * + * \sa last + */ +#ifdef EIGEN_PARSED_BY_DOXYGEN +static const auto lastp1 = last+fix<1>; +#else +// Using a FixedExpr<1> expression is important here to make sure the compiler +// can fully optimize the computation starting indices with zero overhead. 
+static const symbolic::AddExpr,symbolic::ValueExpr > > lastp1(last+fix<1>()); +#endif + +namespace internal { + + // Replace symbolic last/end "keywords" by their true runtime value +inline Index eval_expr_given_size(Index x, Index /* size */) { return x; } + +template +FixedInt eval_expr_given_size(FixedInt x, Index /*size*/) { return x; } + +template +Index eval_expr_given_size(const symbolic::BaseExpr &x, Index size) +{ + return x.derived().eval(last=size-1); +} + +// Extract increment/step at compile time +template struct get_compile_time_incr { + enum { value = UndefinedIncr }; +}; + +// Analogue of std::get<0>(x), but tailored for our needs. +template +EIGEN_CONSTEXPR Index first(const T& x) EIGEN_NOEXCEPT { return x.first(); } + +// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice +// The generic implementation is a no-op +template +struct IndexedViewCompatibleType { + typedef T type; +}; + +template +const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; } + +//-------------------------------------------------------------------------------- +// Handling of a single Index +//-------------------------------------------------------------------------------- + +struct SingleRange { + enum { + SizeAtCompileTime = 1 + }; + SingleRange(Index val) : m_value(val) {} + Index operator[](Index) const { return m_value; } + static EIGEN_CONSTEXPR Index size() EIGEN_NOEXCEPT { return 1; } + Index first() const EIGEN_NOEXCEPT { return m_value; } + Index m_value; +}; + +template<> struct get_compile_time_incr { + enum { value = 1 }; // 1 or 0 ?? 
+}; + +// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods) +template +struct IndexedViewCompatibleType::value>::type> { + // Here we could simply use Array, but maybe it's less work for the compiler to use + // a simpler wrapper as SingleRange + //typedef Eigen::Array type; + typedef SingleRange type; +}; + +template +struct IndexedViewCompatibleType::value>::type> { + typedef SingleRange type; +}; + + +template +typename enable_if::value,SingleRange>::type +makeIndexedViewCompatible(const T& id, Index size, SpecializedType) { + return eval_expr_given_size(id,size); +} + +//-------------------------------------------------------------------------------- +// Handling of all +//-------------------------------------------------------------------------------- + +struct all_t { all_t() {} }; + +// Convert a symbolic 'all' into a usable range type +template +struct AllRange { + enum { SizeAtCompileTime = XprSize }; + AllRange(Index size = XprSize) : m_size(size) {} + EIGEN_CONSTEXPR Index operator[](Index i) const EIGEN_NOEXCEPT { return i; } + EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { return m_size.value(); } + EIGEN_CONSTEXPR Index first() const EIGEN_NOEXCEPT { return 0; } + variable_if_dynamic m_size; +}; + +template +struct IndexedViewCompatibleType { + typedef AllRange type; +}; + +template +inline AllRange::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) { + return AllRange::value>(size); +} + +template struct get_compile_time_incr > { + enum { value = 1 }; +}; + +} // end namespace internal + + +/** \var all + * \ingroup Core_Module + * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns + */ +static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all + + +namespace placeholders { + typedef symbolic::SymbolExpr last_t; + typedef 
symbolic::AddExpr,symbolic::ValueExpr > > end_t; + typedef Eigen::internal::all_t all_t; + + EIGEN_DEPRECATED static const all_t all = Eigen::all; // PLEASE use Eigen::all instead of Eigen::placeholders::all + EIGEN_DEPRECATED static const last_t last = Eigen::last; // PLEASE use Eigen::last instead of Eigen::placeholders::last + EIGEN_DEPRECATED static const end_t end = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end +} + +} // end namespace Eigen + +#endif // EIGEN_INDEXED_VIEW_HELPER_H diff --git a/externals/eigen/Eigen/src/Core/util/IntegralConstant.h b/externals/eigen/Eigen/src/Core/util/IntegralConstant.h new file mode 100644 index 00000000..945d426e --- /dev/null +++ b/externals/eigen/Eigen/src/Core/util/IntegralConstant.h @@ -0,0 +1,272 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_INTEGRAL_CONSTANT_H +#define EIGEN_INTEGRAL_CONSTANT_H + +namespace Eigen { + +namespace internal { + +template class FixedInt; +template class VariableAndFixedInt; + +/** \internal + * \class FixedInt + * + * This class embeds a compile-time integer \c N. + * + * It is similar to c++11 std::integral_constant but with some additional features + * such as: + * - implicit conversion to int + * - arithmetic and some bitwise operators: -, +, *, /, %, &, | + * - c++98/14 compatibility with fix and fix() syntax to define integral constants. + * + * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to + * be created by the user using Eigen::fix or Eigen::fix(). 
In C++98-11, the former syntax does + * not create a FixedInt instance but rather a point to function that needs to be \em cleaned-up + * using the generic helper: + * \code + * internal::cleanup_index_type::type + * internal::cleanup_index_type::type + * \endcode + * where T can a FixedInt, a pointer to function FixedInt (*)(), or numerous other integer-like representations. + * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. + * + * For convenience, you can extract the compile-time value \c N in a generic way using the following helper: + * \code + * internal::get_fixed_value::value + * \endcode + * that will give you \c N if T equals FixedInt or FixedInt (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). + * + * \sa fix, class VariableAndFixedInt + */ +template class FixedInt +{ +public: + static const int value = N; + EIGEN_CONSTEXPR operator int() const { return value; } + FixedInt() {} + FixedInt( VariableAndFixedInt other) { + #ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(other); + #endif + eigen_internal_assert(int(other)==N); + } + + FixedInt<-N> operator-() const { return FixedInt<-N>(); } + template + FixedInt operator+( FixedInt) const { return FixedInt(); } + template + FixedInt operator-( FixedInt) const { return FixedInt(); } + template + FixedInt operator*( FixedInt) const { return FixedInt(); } + template + FixedInt operator/( FixedInt) const { return FixedInt(); } + template + FixedInt operator%( FixedInt) const { return FixedInt(); } + template + FixedInt operator|( FixedInt) const { return FixedInt(); } + template + FixedInt operator&( FixedInt) const { return FixedInt(); } + +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + // Needed in C++14 to allow fix(): + FixedInt operator() () const { return *this; } + + VariableAndFixedInt operator() (int val) const { return VariableAndFixedInt(val); } +#else + FixedInt ( FixedInt (*)() ) {} +#endif + +#if 
EIGEN_HAS_CXX11 + FixedInt(std::integral_constant) {} +#endif +}; + +/** \internal + * \class VariableAndFixedInt + * + * This class embeds both a compile-time integer \c N and a runtime integer. + * Both values are supposed to be equal unless the compile-time value \c N has a special + * value meaning that the runtime-value should be used. Depending on the context, this special + * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for + * quantities that can be negative). + * + * It is the return-type of the function Eigen::fix(int), and most of the time this is the only + * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt. + * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert + * it to either a true compile-time quantity (i.e. a FixedInt), or to a runtime quantity (e.g., an Index) + * using the following generic helper: + * \code + * internal::cleanup_index_type::type + * internal::cleanup_index_type::type + * \endcode + * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations. + * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. + * + * For convenience, you can also extract the compile-time value \c N using the following helper: + * \code + * internal::get_fixed_value::value + * \endcode + * that will give you \c N if T equals VariableAndFixedInt, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). 
+ * + * \sa fix(int), class FixedInt + */ +template class VariableAndFixedInt +{ +public: + static const int value = N; + operator int() const { return m_value; } + VariableAndFixedInt(int val) { m_value = val; } +protected: + int m_value; +}; + +template struct get_fixed_value { + static const int value = Default; +}; + +template struct get_fixed_value,Default> { + static const int value = N; +}; + +#if !EIGEN_HAS_CXX14 +template struct get_fixed_value (*)(),Default> { + static const int value = N; +}; +#endif + +template struct get_fixed_value,Default> { + static const int value = N ; +}; + +template +struct get_fixed_value,Default> { + static const int value = N; +}; + +template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } +#if !EIGEN_HAS_CXX14 +template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } +#endif + +// Cleanup integer/FixedInt/VariableAndFixedInt/etc types: + +// By default, no cleanup: +template struct cleanup_index_type { typedef T type; }; + +// Convert any integral type (e.g., short, int, unsigned int, etc.) 
to Eigen::Index +template struct cleanup_index_type::value>::type> { typedef Index type; }; + +#if !EIGEN_HAS_CXX14 +// In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: +template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; +#endif + +// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value: +template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; +// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index): +template struct cleanup_index_type, DynamicKey> { typedef Index type; }; + +#if EIGEN_HAS_CXX11 +template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; +#endif + +} // end namespace internal + +#ifndef EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES +template +static const internal::FixedInt fix{}; +#else +template +inline internal::FixedInt fix() { return internal::FixedInt(); } + +// The generic typename T is mandatory. Otherwise, a code like fix could refer to either the function above or this next overload. +// This way a code like fix can only refer to the previous function. +template +inline internal::VariableAndFixedInt fix(T val) { return internal::VariableAndFixedInt(internal::convert_index(val)); } +#endif + +#else // EIGEN_PARSED_BY_DOXYGEN + +/** \var fix() + * \ingroup Core_Module + * + * This \em identifier permits to construct an object embedding a compile-time integer \c N. + * + * \tparam N the compile-time integer value + * + * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them: + * \code + * seqN(10,fix<4>,fix<-3>) // <=> [10 7 4 1] + * \endcode + * + * See also the function fix(int) to pass both a compile-time and runtime value. 
+ * + * In c++14, it is implemented as: + * \code + * template static const internal::FixedInt fix{}; + * \endcode + * where internal::FixedInt is an internal template class similar to + * \c std::integral_constant + * Here, \c fix is thus an object of type \c internal::FixedInt. + * + * In c++98/11, it is implemented as a function: + * \code + * template inline internal::FixedInt fix(); + * \endcode + * Here internal::FixedInt is thus a pointer to function. + * + * If for some reason you want a true object in c++98 then you can write: \code fix() \endcode which is also valid in c++14. + * + * \sa fix(int), seq, seqN + */ +template +static const auto fix(); + +/** \fn fix(int) + * \ingroup Core_Module + * + * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val. + * + * \tparam N the compile-time integer value + * \param val the fallback runtime integer value + * + * This function is a more general version of the \ref fix identifier/function that can be used in template code + * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers + * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers + * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val + * will be used as a fallback. + * + * A typical use case would be: + * \code + * template void foo(const MatrixBase &mat) { + * const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2; + * const int n = mat.rows()/2; + * ... mat( seqN(0,fix(n) ) ...; + * } + * \endcode + * In this example, the function Eigen::seqN knows that the second argument is expected to be a size. + * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n. 
+ * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as seqN(0,fix) . + * + * \sa fix, seqN, class ArithmeticSequence + */ +template +static const auto fix(int val); + +#endif // EIGEN_PARSED_BY_DOXYGEN + +} // end namespace Eigen + +#endif // EIGEN_INTEGRAL_CONSTANT_H diff --git a/externals/eigen/Eigen/src/Core/util/MKL_support.h b/externals/eigen/Eigen/src/Core/util/MKL_support.h index 26b59669..17963fad 100644 --- a/externals/eigen/Eigen/src/Core/util/MKL_support.h +++ b/externals/eigen/Eigen/src/Core/util/MKL_support.h @@ -49,12 +49,17 @@ #define EIGEN_USE_LAPACKE #endif -#if defined(EIGEN_USE_MKL_VML) +#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL) #define EIGEN_USE_MKL #endif + #if defined EIGEN_USE_MKL -# include +# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL) +# define MKL_DIRECT_CALL +# define MKL_DIRECT_CALL_JUST_SET +# endif +# include /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ # ifndef INTEL_MKL_VERSION # undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */ @@ -68,6 +73,9 @@ # undef EIGEN_USE_MKL_VML # undef EIGEN_USE_LAPACKE_STRICT # undef EIGEN_USE_LAPACKE +# ifdef MKL_DIRECT_CALL_JUST_SET +# undef MKL_DIRECT_CALL +# endif # endif #endif @@ -108,6 +116,10 @@ #endif #endif +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#include "../../misc/blas.h" +#endif + namespace Eigen { typedef std::complex dcomplex; @@ -121,8 +133,5 @@ typedef int BlasIndex; } // end namespace Eigen -#if defined(EIGEN_USE_BLAS) -#include "../../misc/blas.h" -#endif #endif // EIGEN_MKL_SUPPORT_H diff --git a/externals/eigen/Eigen/src/Core/util/Macros.h b/externals/eigen/Eigen/src/Core/util/Macros.h index 427d3cd6..986c3d44 100644 --- a/externals/eigen/Eigen/src/Core/util/Macros.h +++ b/externals/eigen/Eigen/src/Core/util/Macros.h @@ -11,19 +11,56 @@ #ifndef EIGEN_MACROS_H #define EIGEN_MACROS_H 
+//------------------------------------------------------------------------------------------ +// Eigen version and basic defaults +//------------------------------------------------------------------------------------------ + #define EIGEN_WORLD_VERSION 3 -#define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 3 +#define EIGEN_MAJOR_VERSION 4 +#define EIGEN_MINOR_VERSION 0 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ EIGEN_MINOR_VERSION>=z)))) +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor +#else +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor +#endif + +#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t +#endif + +// Upperbound on the C++ version to use. +// Expected values are 03, 11, 14, 17, etc. +// By default, let's use an arbitrarily large C++ version. +#ifndef EIGEN_MAX_CPP_VER +#define EIGEN_MAX_CPP_VER 99 +#endif + +/** Allows to disable some optimizations which might affect the accuracy of the result. + * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. + * They currently include: + * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. 
+ */ +#ifndef EIGEN_FAST_MATH +#define EIGEN_FAST_MATH 1 +#endif + +#ifndef EIGEN_STACK_ALLOCATION_LIMIT +// 131072 == 128 KB +#define EIGEN_STACK_ALLOCATION_LIMIT 131072 +#endif + +//------------------------------------------------------------------------------------------ // Compiler identification, EIGEN_COMP_* +//------------------------------------------------------------------------------------------ /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC #ifdef __GNUC__ - #define EIGEN_COMP_GNUC 1 + #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__) #else #define EIGEN_COMP_GNUC 0 #endif @@ -35,6 +72,12 @@ #define EIGEN_COMP_CLANG 0 #endif +/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML +#if defined(__castxml__) + #define EIGEN_COMP_CASTXML 1 +#else + #define EIGEN_COMP_CASTXML 0 +#endif /// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm #if defined(__llvm__) @@ -71,14 +114,44 @@ #define EIGEN_COMP_MSVC 0 #endif +#if defined(__NVCC__) +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) + #define EIGEN_COMP_NVCC __CUDACC_VER__ +#else + #error "NVCC did not define compiler version." +#endif +#else + #define EIGEN_COMP_NVCC 0 +#endif + // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: -// name ver MSC_VER -// 2008 9 1500 -// 2010 10 1600 -// 2012 11 1700 -// 2013 12 1800 -// 2015 14 1900 -// "15" 15 1900 +// name ver MSC_VER +// 2008 9 1500 +// 2010 10 1600 +// 2012 11 1700 +// 2013 12 1800 +// 2015 14 1900 +// "15" 15 1900 +// 2017-14.1 15.0 1910 +// 2017-14.11 15.3 1911 +// 2017-14.12 15.5 1912 +// 2017-14.13 15.6 1913 +// 2017-14.14 15.7 1914 + +/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise. 
+#if defined(_MSVC_LANG) + #define EIGEN_COMP_MSVC_LANG _MSVC_LANG +#else + #define EIGEN_COMP_MSVC_LANG 0 +#endif + +// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG: +// MSVC option Standard MSVC_LANG +// /std:c++14 (default as of VS 2019) C++14 201402L +// /std:c++17 C++17 201703L +// /std:c++latest >C++17 >201703L /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) @@ -87,16 +160,21 @@ #define EIGEN_COMP_MSVC_STRICT 0 #endif -/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++ -#if defined(__IBMCPP__) || defined(__xlc__) - #define EIGEN_COMP_IBM 1 +/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ +// XLC version +// 3.1 0x0301 +// 4.5 0x0405 +// 5.0 0x0500 +// 12.1 0x0C01 +#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) + #define EIGEN_COMP_IBM __xlC__ #else #define EIGEN_COMP_IBM 0 #endif -/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler +/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler #if defined(__PGI) - #define EIGEN_COMP_PGI 1 + #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__) #else #define EIGEN_COMP_PGI 0 #endif @@ -108,7 +186,7 @@ #define EIGEN_COMP_ARM 0 #endif -/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler +/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler #if defined(__EMSCRIPTEN__) #define EIGEN_COMP_EMSCRIPTEN 1 #else @@ -142,9 +220,13 @@ #endif + +//------------------------------------------------------------------------------------------ // Architecture identification, EIGEN_ARCH_* +//------------------------------------------------------------------------------------------ + -#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) +#if 
defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64) #define EIGEN_ARCH_x86_64 1 #else #define EIGEN_ARCH_x86_64 0 @@ -170,18 +252,61 @@ #endif /// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64 -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define EIGEN_ARCH_ARM64 1 #else #define EIGEN_ARCH_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64 #if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64 #define EIGEN_ARCH_ARM_OR_ARM64 1 #else #define EIGEN_ARCH_ARM_OR_ARM64 0 #endif +/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater. +#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8 +#define EIGEN_ARCH_ARMV8 1 +#else +#define EIGEN_ARCH_ARMV8 0 +#endif + + +/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE +/// compliant Arm fp16 type +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16 + #if defined(__ARM_FP16_FORMAT_IEEE) + #define EIGEN_HAS_ARM64_FP16 1 + #else + #define EIGEN_HAS_ARM64_FP16 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. 
+#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -212,7 +337,9 @@ +//------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* +//------------------------------------------------------------------------------------------ /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant #if defined(__unix__) || defined(__unix) @@ -299,9 +426,17 @@ #define EIGEN_OS_WIN_STRICT 0 #endif -/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN +/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN +// compiler solaris __SUNPRO_C +// version studio +// 5.7 10 0x570 +// 5.8 11 0x580 +// 5.9 12 0x590 +// 5.10 12.1 0x5100 +// 5.11 12.2 0x5110 +// 5.12 12.3 0x5120 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__)) - #define EIGEN_OS_SUN 1 + #define EIGEN_OS_SUN __SUNPRO_C #else #define EIGEN_OS_SUN 0 #endif @@ -314,26 +449,137 @@ #endif +//------------------------------------------------------------------------------------------ +// Detect GPU compilers and architectures +//------------------------------------------------------------------------------------------ -#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG - // see bug 89 - #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0 -#else - #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 +// NVCC is not supported as the target platform for HIPCC +// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive +#if defined(__NVCC__) && defined(__HIPCC__) + #error "NVCC as the target platform for HIPCC is currently not supported." 
#endif -// This macro can be used to prevent from macro expansion, e.g.: -// std::max EIGEN_NOT_A_MACRO(a,b) -#define EIGEN_NOT_A_MACRO +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + // Means the compiler is either nvcc or clang with CUDA enabled + #define EIGEN_CUDACC __CUDACC__ +#endif -#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + // Means we are generating code for the device + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + +#if defined(EIGEN_CUDACC) +#include + #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) #else -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor + #define EIGEN_CUDA_SDK_VER 0 #endif -#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t +#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) + // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) + #define EIGEN_HIPCC __HIPCC__ + + // We need to include hip_runtime.h here because it pulls in + // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__ + // ++ host_defines.h which contains the defines for the __host__ and __device__ macros + #include + + #if defined(__HIP_DEVICE_COMPILE__) + // analogous to EIGEN_CUDA_ARCH, but for HIP + #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ + #endif + + // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute + // value to 1024. The compiler assigns a default value of 256 when the attribute is not + // specified. This results in failures on the HIP platform, for cases when a GPU kernel + // without an explicit launch_bounds attribute is called with a threads_per_block value + // greater than 256. 
+ // + // This is a regression in functioanlity and is expected to be fixed within the next + // couple of ROCm releases (compiler will go back to using 1024 value as the default) + // + // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds + // attribute. + + #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024) + +#endif + +#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) +#define EIGEN_HIP_LAUNCH_BOUNDS_1024 +#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) + +// Unify CUDA/HIPCC + +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC +// +#define EIGEN_GPUCC +// +// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels +// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels +// +// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels. +// For those cases, the corresponding code should be guarded with +// #if defined(EIGEN_GPUCC) +// instead of +// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDACC) +// +#endif + +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE +// +#define EIGEN_GPU_COMPILE_PHASE +// +// GPU compilers (HIPCC, NVCC) typically do two passes over the source code, +// + one to compile the source for the "host" (ie CPU) +// + another to compile the source for the "device" (ie. 
GPU) +// +// Code that needs to enabled only during the either the "host" or "device" compilation phase +// needs to be guarded with a macro that indicates the current compilation phase +// +// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP +// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA +// +// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA +// For those cases, the code should be guarded with +// #if defined(EIGEN_GPU_COMPILE_PHASE) +// instead of +// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDA_ARCH) +// +#endif + +#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) +// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. +// In most cases we want to check if both macros are defined which can be done using the define below. +#define SYCL_DEVICE_ONLY +#endif + +//------------------------------------------------------------------------------------------ +// Detect Compiler/Architecture/OS specific features +//------------------------------------------------------------------------------------------ + +#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG + // see bug 89 + #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0 +#else + #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 #endif // Cross compiler wrapper around LLVM's __has_builtin @@ -349,26 +595,79 @@ # define __has_feature(x) 0 #endif -// Upperbound on the C++ version to use. -// Expected values are 03, 11, 14, 17, etc. -// By default, let's use an arbitrarily large C++ version. 
-#ifndef EIGEN_MAX_CPP_VER -#define EIGEN_MAX_CPP_VER 99 +// Some old compilers do not support template specializations like: +// template void foo(const T x[N]); +#if !( EIGEN_COMP_CLANG && ( (EIGEN_COMP_CLANG<309) \ + || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000))) \ + || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 +#else +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 #endif -#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) +// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that +// works for both platforms, indicating the C++ standard version number. +// +// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will +// report 199711L regardless of the language standard specified via /std. +// We need to rely on _MSVC_LANG instead, which is only available after +// VS2015.3. +#if EIGEN_COMP_MSVC_LANG > 0 +#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG +#elif EIGEN_COMP_MSVC >= 1900 +#define EIGEN_CPLUSPLUS 201103L +#elif defined(__cplusplus) +#define EIGEN_CPLUSPLUS __cplusplus +#else +#define EIGEN_CPLUSPLUS 0 +#endif + +// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler. +// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER +// is defined to 17. +#if EIGEN_CPLUSPLUS > 201703L + #define EIGEN_COMP_CXXVER 20 +#elif EIGEN_CPLUSPLUS > 201402L + #define EIGEN_COMP_CXXVER 17 +#elif EIGEN_CPLUSPLUS > 201103L + #define EIGEN_COMP_CXXVER 14 +#elif EIGEN_CPLUSPLUS >= 201103L + #define EIGEN_COMP_CXXVER 11 +#else + #define EIGEN_COMP_CXXVER 03 +#endif + +#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 + #else + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 + #endif +#endif + + +// The macros EIGEN_HAS_CXX?? 
defines a rough estimate of available c++ features +// but in practice we should not rely on them but rather on the availabilty of +// individual features as defined later. +// This is why there is no EIGEN_HAS_CXX17. +// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11. +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11 #define EIGEN_HAS_CXX11 1 #else #define EIGEN_HAS_CXX11 0 #endif +#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14 +#define EIGEN_HAS_CXX14 1 +#else +#define EIGEN_HAS_CXX14 0 +#endif // Do we support r-value references? #ifndef EIGEN_HAS_RVALUE_REFERENCES #if EIGEN_MAX_CPP_VER>=11 && \ (__has_feature(cxx_rvalue_references) || \ - (defined(__cplusplus) && __cplusplus >= 201103L) || \ - (EIGEN_COMP_MSVC >= 1600)) + (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) #define EIGEN_HAS_RVALUE_REFERENCES 1 #else #define EIGEN_HAS_RVALUE_REFERENCES 0 @@ -376,11 +675,14 @@ #endif // Does the compiler support C99? +// Need to include to make sure _GLIBCXX_USE_C99 gets defined +#include #ifndef EIGEN_HAS_C99_MATH #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ - || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))) + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ + || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY)) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 @@ -388,21 +690,73 @@ #endif // Does the compiler support result_of? +// result_of was deprecated in c++17 and removed in c++ 20 #ifndef EIGEN_HAS_STD_RESULT_OF -#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))) +#if EIGEN_HAS_CXX11 && EIGEN_COMP_CXXVER < 17 #define EIGEN_HAS_STD_RESULT_OF 1 #else #define EIGEN_HAS_STD_RESULT_OF 0 #endif #endif +// Does the compiler support std::hash? 
+#ifndef EIGEN_HAS_STD_HASH +// The std::hash struct is defined in C++11 but is not labelled as a __device__ +// function and is not constexpr, so cannot be used on device. +#if EIGEN_HAS_CXX11 && !defined(EIGEN_GPU_COMPILE_PHASE) +#define EIGEN_HAS_STD_HASH 1 +#else +#define EIGEN_HAS_STD_HASH 0 +#endif +#endif // EIGEN_HAS_STD_HASH + +#ifndef EIGEN_HAS_STD_INVOKE_RESULT +#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17 +#define EIGEN_HAS_STD_INVOKE_RESULT 1 +#else +#define EIGEN_HAS_STD_INVOKE_RESULT 0 +#endif +#endif + +#ifndef EIGEN_HAS_ALIGNAS +#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \ + ( __has_feature(cxx_alignas) \ + || EIGEN_HAS_CXX14 \ + || (EIGEN_COMP_MSVC >= 1800) \ + || (EIGEN_GNUC_AT_LEAST(4,8)) \ + || (EIGEN_COMP_CLANG>=305) \ + || (EIGEN_COMP_ICC>=1500) \ + || (EIGEN_COMP_PGI>=1500) \ + || (EIGEN_COMP_SUNCC>=0x5130)) +#define EIGEN_HAS_ALIGNAS 1 +#else +#define EIGEN_HAS_ALIGNAS 0 +#endif +#endif + +// Does the compiler support type_traits? +// - full support of type traits was added only to GCC 5.1.0. +// - 20150626 corresponds to the last release of 4.x libstdc++ +#ifndef EIGEN_HAS_TYPE_TRAITS +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \ + && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \ + && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626) +#define EIGEN_HAS_TYPE_TRAITS 1 +#define EIGEN_INCLUDE_TYPE_TRAITS +#else +#define EIGEN_HAS_TYPE_TRAITS 0 +#endif +#endif + // Does the compiler support variadic templates? 
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES -#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) \ + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#elif EIGEN_MAX_CPP_VER>=11 && (EIGEN_COMP_CXXVER >= 11) && defined(SYCL_DEVICE_ONLY) +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 #else #define EIGEN_HAS_VARIADIC_TEMPLATES 0 #endif @@ -410,27 +764,33 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR + #if defined(EIGEN_CUDACC) + // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above + #if EIGEN_MAX_CPP_VER>=14 && (EIGEN_COMP_CXXVER >= 11 && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500)) + #define EIGEN_HAS_CONSTEXPR 1 + #endif + #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (EIGEN_COMP_CXXVER >= 14) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (EIGEN_COMP_CXXVER >= 11)) || \ + (EIGEN_COMP_CLANG >= 306 && (EIGEN_COMP_CXXVER >= 11))) + #define EIGEN_HAS_CONSTEXPR 1 + #endif -#ifdef __CUDACC__ -// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) - #define EIGEN_HAS_CONSTEXPR 1 -#endif -#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ - (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))) -#define EIGEN_HAS_CONSTEXPR 1 -#endif + #ifndef 
EIGEN_HAS_CONSTEXPR + #define EIGEN_HAS_CONSTEXPR 0 + #endif -#ifndef EIGEN_HAS_CONSTEXPR -#define EIGEN_HAS_CONSTEXPR 0 -#endif +#endif // EIGEN_HAS_CONSTEXPR +#if EIGEN_HAS_CONSTEXPR +#define EIGEN_CONSTEXPR constexpr +#else +#define EIGEN_CONSTEXPR #endif // Does the compiler support C++11 math? // Let's be conservative and enable the default C++11 implementation only if we are sure it exists #ifndef EIGEN_HAS_CXX11_MATH - #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + #if EIGEN_MAX_CPP_VER>=11 && ((EIGEN_COMP_CXXVER > 11) || (EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)) #define EIGEN_HAS_CXX11_MATH 1 #else @@ -441,9 +801,8 @@ // Does the compiler support proper C++11 containers? #ifndef EIGEN_HAS_CXX11_CONTAINERS #if EIGEN_MAX_CPP_VER>=11 && \ - ((__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ - || EIGEN_COMP_MSVC >= 1900) + ((EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) #define EIGEN_HAS_CXX11_CONTAINERS 1 #else #define EIGEN_HAS_CXX11_CONTAINERS 0 @@ -454,24 +813,88 @@ #ifndef EIGEN_HAS_CXX11_NOEXCEPT #if EIGEN_MAX_CPP_VER>=11 && \ (__has_feature(cxx_noexcept) \ - || (__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ - || EIGEN_COMP_MSVC >= 1900) + || (EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC>=1400))) #define EIGEN_HAS_CXX11_NOEXCEPT 1 #else #define EIGEN_HAS_CXX11_NOEXCEPT 0 #endif #endif -/** Allows to disable 
some optimizations which might affect the accuracy of the result. - * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. - * They currently include: - * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. - */ -#ifndef EIGEN_FAST_MATH -#define EIGEN_FAST_MATH 1 +#ifndef EIGEN_HAS_CXX11_ATOMIC + #if EIGEN_MAX_CPP_VER>=11 && \ + (__has_feature(cxx_atomic) \ + || (EIGEN_COMP_CXXVER > 11) \ + || ((EIGEN_COMP_CXXVER == 11) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700))) + #define EIGEN_HAS_CXX11_ATOMIC 1 + #else + #define EIGEN_HAS_CXX11_ATOMIC 0 + #endif +#endif + +#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL + #if EIGEN_MAX_CPP_VER>=11 && \ + (EIGEN_COMP_CXXVER >= 11 || EIGEN_COMP_MSVC >= 1700) + #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1 + #else + #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0 + #endif +#endif + +// NOTE: the required Apple's clang version is very conservative +// and it could be that XCode 9 works just fine. +// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support +// and not tested. 
+#ifndef EIGEN_HAS_CXX17_OVERALIGN +#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \ + (EIGEN_COMP_MSVC >= 1912) \ + || (EIGEN_GNUC_AT_LEAST(7,0)) \ + || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \ + || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \ + ) +#define EIGEN_HAS_CXX17_OVERALIGN 1 +#else +#define EIGEN_HAS_CXX17_OVERALIGN 0 +#endif +#endif + +#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR + // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules + #if defined(__NVCC__) + // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr + #ifdef __CUDACC_RELAXED_CONSTEXPR__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif + #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr) + // clang++ always considers constexpr functions as implicitly __host__ __device__ + #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC + #endif #endif +// Does the compiler support the __int128 and __uint128_t extensions for 128-bit +// integer arithmetic? +// +// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported, +// but we avoid using them in certain cases: +// +// * Building using Clang for Windows, where the Clang runtime library has +// 128-bit support only on LP64 architectures, but Windows is LLP64. 
+#ifndef EIGEN_HAS_BUILTIN_INT128 +#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG) +#define EIGEN_HAS_BUILTIN_INT128 1 +#else +#define EIGEN_HAS_BUILTIN_INT128 0 +#endif +#endif + +//------------------------------------------------------------------------------------------ +// Preprocessor programming helpers +//------------------------------------------------------------------------------------------ + +// This macro can be used to prevent from macro expansion, e.g.: +// std::max EIGEN_NOT_A_MACRO(a,b) +#define EIGEN_NOT_A_MACRO + #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl; // concatenate two tokens @@ -487,11 +910,13 @@ // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC, // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but GCC is still doing fine with just inline. -#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC +#ifndef EIGEN_STRONG_INLINE +#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC) #define EIGEN_STRONG_INLINE __forceinline #else #define EIGEN_STRONG_INLINE inline #endif +#endif // EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible // attribute to maximize inlining. 
This should only be used when really necessary: in particular, @@ -501,7 +926,7 @@ // Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval Eigen::MatrixBase::eval() const' // : function body not available // See also bug 1367 -#if EIGEN_GNUC_AT_LEAST(4,2) +#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY) #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline #else #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE @@ -521,12 +946,43 @@ #define EIGEN_PERMISSIVE_EXPR #endif +// GPU stuff + +// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC) +#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC) + // Do not try asserts on device code + #ifndef EIGEN_NO_DEBUG + #define EIGEN_NO_DEBUG + #endif + + #ifdef EIGEN_INTERNAL_DEBUGGING + #undef EIGEN_INTERNAL_DEBUGGING + #endif + + #ifdef EIGEN_EXCEPTIONS + #undef EIGEN_EXCEPTIONS + #endif +#endif + +#if defined(SYCL_DEVICE_ONLY) + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) +// All functions callable from CUDA/HIP code must be qualified with __device__ +#elif defined(EIGEN_GPUCC) + #define EIGEN_DEVICE_FUNC __host__ __device__ +#else + #define EIGEN_DEVICE_FUNC +#endif + + // this macro allows to get rid of linking errors about multiply defined functions. // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. 
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG @@ -536,7 +992,11 @@ // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89 #ifdef EIGEN_NO_DEBUG - #define eigen_plain_assert(x) + #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device + #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x) + #else + #define eigen_plain_assert(x) + #endif #else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { @@ -610,7 +1070,7 @@ // Suppresses 'unused variable' warnings. namespace Eigen { namespace internal { - template EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {} } } #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); @@ -624,169 +1084,75 @@ namespace Eigen { #endif -//------------------------------------------------------------------------------------------ -// Static and dynamic alignment control +// Acts as a barrier preventing operations involving `X` from crossing. This +// occurs, for example, in the fast rounding trick where a magic constant is +// added then subtracted, which is otherwise compiled away with -ffast-math. // -// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES -// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. -// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, -// a default value is automatically computed based on architecture, compiler, and OS. 
-// -// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} -// to be used to declare statically aligned buffers. -//------------------------------------------------------------------------------------------ - - -/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. - * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, - * so that vectorization doesn't affect binary compatibility. - * - * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link - * vectorized and non-vectorized code. - */ -#if (defined __CUDACC__) - #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) -#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#elif EIGEN_COMP_MSVC - #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) -#elif EIGEN_COMP_SUNCC - // FIXME not sure about this one: - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#else - #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler -#endif - -// If the user explicitly disable vectorization, then we also disable alignment -#if defined(EIGEN_DONT_VECTORIZE) - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 -#elif defined(EIGEN_VECTORIZE_AVX512) - // 64 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 -#elif defined(__AVX__) - // 32 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 -#else - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 -#endif - - -// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense -#define EIGEN_MIN_ALIGN_BYTES 16 - -// Defined the boundary (in bytes) on which the data needs to be aligned. 
Note -// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be -// aligned at all regardless of the value of this #define. - -#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. -#endif - -// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated -// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 -#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) - #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES - #undef EIGEN_MAX_STATIC_ALIGN_BYTES - #endif - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 -#endif - -#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES - - // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - - // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable - // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always - // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in - // certain common platform (compiler+architecture combinations) to avoid these problems. - // Only static alignment is really problematic (relies on nonstandard compiler extensions), - // try to keep heap alignment even when we have to disable static alignment. - #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) - // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. 
- // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. - // 4.8 and newer seem definitely unaffected. - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #else - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 - #endif - - // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ - && !EIGEN_GCC3_OR_OLDER \ - && !EIGEN_COMP_SUNCC \ - && !EIGEN_OS_QNX - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 - #else - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 - #endif - - #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT - #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +// See bug 1674 +#if !defined(EIGEN_OPTIMIZATION_BARRIER) + #if EIGEN_COMP_GNUC + // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: + // X: Any operand whatsoever. + // r: A register operand is allowed provided that it is in a general + // register. + // g: Any register, memory or immediate integer operand is allowed, except + // for registers that are not general registers. + // w: (AArch32/AArch64) Floating point register, Advanced SIMD vector + // register or SVE vector register. + // x: (SSE) Any SSE register. + // (AArch64) Like w, but restricted to registers 0 to 15 inclusive. + // v: (PowerPC) An Altivec vector register. + // wa:(PowerPC) A VSX register. + // + // "X" (uppercase) should work for all cases, though this seems to fail for + // some versions of GCC for arm/aarch64 with + // "error: inconsistent operand constraints in an 'asm'" + // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and + // vectors, otherwise + // "error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type" + // + // GCC for ppc64le generates an internal compiler error with x/X/g. + // GCC for AVX generates an internal compiler error with X. 
+ // + // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq + // gcc for arm, aarch64, + // gcc for ppc64le, + // both vectors and scalars. + // + // Note that this is restricted to plain types - this will not work + // directly for std::complex, Eigen::half, Eigen::bfloat16. For these, + // you will need to apply to the underlying POD type. + #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT + // This seems to be broken on clang. Packet4f is loaded into a single + // register rather than a vector, zeroing out some entries. Integer + // types also generate a compile error. + // General, Altivec, VSX. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #elif EIGEN_ARCH_ARM_OR_ARM64 + // General, NEON. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #elif EIGEN_ARCH_i386_OR_x86_64 + // General, SSE. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); + #else + // Not implemented for other architectures. + #define EIGEN_OPTIMIZATION_BARRIER(X) + #endif #else - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 - #endif - -#endif - -// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES -#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. -// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES) -// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). -// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. 
- - -// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY -#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) -#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) -#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) -#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) -#else -#define EIGEN_ALIGN_MAX -#endif - - -// Dynamic alignment control - -#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 -#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. -#endif - -#ifdef EIGEN_DONT_ALIGN - #ifdef EIGEN_MAX_ALIGN_BYTES - #undef EIGEN_MAX_ALIGN_BYTES + // Not implemented for other compilers. + #define EIGEN_OPTIMIZATION_BARRIER(X) #endif - #define EIGEN_MAX_ALIGN_BYTES 0 -#elif !defined(EIGEN_MAX_ALIGN_BYTES) - #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES #endif -#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#if EIGEN_COMP_MSVC + // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362. + // This workaround is ugly, but it does the job. 
+# define EIGEN_CONST_CONDITIONAL(cond) (void)0, cond #else -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES -#endif - - -#ifndef EIGEN_UNALIGNED_VECTORIZE -#define EIGEN_UNALIGNED_VECTORIZE 1 +# define EIGEN_CONST_CONDITIONAL(cond) cond #endif -//---------------------------------------------------------------------- - - #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #define EIGEN_RESTRICT #endif @@ -794,10 +1160,6 @@ namespace Eigen { #define EIGEN_RESTRICT __restrict #endif -#ifndef EIGEN_STACK_ALLOCATION_LIMIT -// 131072 == 128 KB -#define EIGEN_STACK_ALLOCATION_LIMIT 131072 -#endif #ifndef EIGEN_DEFAULT_IO_FORMAT #ifdef EIGEN_MAKING_DOCS @@ -812,7 +1174,23 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) + +// When compiling CUDA/HIP device code with NVCC or HIPCC +// pull in math functions from the global namespace. +// In host mode, and when device code is compiled with clang, +// use the std versions. +#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_USING_STD(FUNC) using ::FUNC; +#else + #define EIGEN_USING_STD(FUNC) using std::FUNC; +#endif + +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC)) + // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, + // otherwise we get duplicate definition errors + // For later MSVC versions, we require explicit operator= definition, otherwise we get + // use of implicitly deleted operator errors. 
+ // (cf Bugs 920, 1000, 1324, 2291) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -832,11 +1210,48 @@ namespace Eigen { #endif +/** + * \internal + * \brief Macro to explicitly define the default copy constructor. + * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. + */ +#if EIGEN_HAS_CXX11 +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; +#else +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) +#endif + + + /** \internal * \brief Macro to manually inherit assignment operators. * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. + * With C++11 or later this also default-implements the copy-constructor */ -#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +/** \internal + * \brief Macro to manually define default constructors and destructors. + * This is necessary when the copy constructor is re-defined. + * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects. + * + * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision + */ +#if EIGEN_HAS_CXX11 +#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ + Derived() = default; \ + ~Derived() = default; +#else +#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ + Derived() {}; \ + /* ~Derived() {}; */ +#endif + + + + /** * Just a side note. 
Commenting within defines works only by documenting @@ -853,7 +1268,8 @@ namespace Eigen { typedef typename Eigen::internal::ref_selector::type Nested; \ typedef typename Eigen::internal::traits::StorageKind StorageKind; \ typedef typename Eigen::internal::traits::StorageIndex StorageIndex; \ - enum { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ + enum CompileTimeTraits \ + { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ ColsAtCompileTime = Eigen::internal::traits::ColsAtCompileTime, \ Flags = Eigen::internal::traits::Flags, \ SizeAtCompileTime = Base::SizeAtCompileTime, \ @@ -898,6 +1314,14 @@ namespace Eigen { #define EIGEN_IMPLIES(a,b) (!(a) || (b)) +#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC +#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false)) +#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true)) +#else +#define EIGEN_PREDICT_FALSE(x) (x) +#define EIGEN_PREDICT_TRUE(x) (x) +#endif + // the expression type of a standard coefficient wise binary operation #define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \ CwiseBinaryOp< \ @@ -929,14 +1353,14 @@ namespace Eigen { const typename internal::plain_constant_type::type, const EXPR> // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010") -#if EIGEN_COMP_MSVC_STRICT<=1600 +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600) #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type #else #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X #endif #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ (METHOD)(const T& scalar) const { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -945,7 +1369,7 @@ 
namespace Eigen { } #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC inline friend \ + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -958,15 +1382,23 @@ namespace Eigen { EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) + #define EIGEN_EXCEPTIONS +#endif + + #ifdef EIGEN_EXCEPTIONS # define EIGEN_THROW_X(X) throw X # define EIGEN_THROW throw # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef __CUDA_ARCH__ +# if defined(EIGEN_CUDA_ARCH) # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") +# elif defined(EIGEN_HIP_DEVICE_COMPILE) +# define EIGEN_THROW_X(X) asm("s_trap 0") +# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() @@ -986,7 +1418,47 @@ namespace Eigen { # define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# define EIGEN_EXCEPTION_SPEC(X) throw(X) +# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17 + // MSVC does not support exception specifications (warning C4290), + // and they are deprecated in c++11 anyway. This is even an error in c++17. +# define EIGEN_EXCEPTION_SPEC(X) throw() +# else +# define EIGEN_EXCEPTION_SPEC(X) throw(X) +# endif +#endif + +#if EIGEN_HAS_VARIADIC_TEMPLATES +// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. +namespace Eigen { +namespace internal { + +inline bool all(){ return true; } + +template +bool all(T t, Ts ... 
ts){ return t && all(ts...); } + +} +} +#endif + +#if EIGEN_HAS_CXX11_OVERRIDE_FINAL +// provide override and final specifiers if they are available: +# define EIGEN_OVERRIDE override +# define EIGEN_FINAL final +#else +# define EIGEN_OVERRIDE +# define EIGEN_FINAL +#endif + +// Wrapping #pragma unroll in a macro since it is required for SYCL +#if defined(SYCL_DEVICE_ONLY) + #if defined(_MSC_VER) + #define EIGEN_UNROLL_LOOP __pragma(unroll) + #else + #define EIGEN_UNROLL_LOOP _Pragma("unroll") + #endif +#else + #define EIGEN_UNROLL_LOOP #endif #endif // EIGEN_MACROS_H diff --git a/externals/eigen/Eigen/src/Core/util/Memory.h b/externals/eigen/Eigen/src/Core/util/Memory.h index c634d7ea..875318cd 100644 --- a/externals/eigen/Eigen/src/Core/util/Memory.h +++ b/externals/eigen/Eigen/src/Core/util/Memory.h @@ -63,14 +63,28 @@ namespace Eigen { namespace internal { -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() { #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); + #if defined(EIGEN_HIPCC) + // + // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), + // and as a consequence the code in the #else block triggers the hipcc warning : + // "no overloaded function has restriction specifiers that are compatible with the ambient context" + // + // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects + // the same on "operator new" + // Reverting code back to the old version in this #if block for the hipcc compiler + // new int[huge]; + #else + void* unused = ::operator new(huge); + EIGEN_UNUSED_VARIABLE(unused); + #endif #endif } @@ -83,19 +97,26 @@ inline void throw_std_bad_alloc() /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. * Fast, but wastes 16 additional bytes of memory. Does not throw any exception. 
*/ -inline void* handmade_aligned_malloc(std::size_t size) +EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) { - void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES); + eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2"); + + EIGEN_USING_STD(malloc) + void *original = malloc(size+alignment); + if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(alignment-1))) + alignment); *(reinterpret_cast(aligned) - 1) = original; return aligned; } /** \internal Frees memory allocated with handmade_aligned_malloc */ -inline void handmade_aligned_free(void *ptr) +EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) { - if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); + if (ptr) { + EIGEN_USING_STD(free) + free(*(reinterpret_cast(ptr) - 1)); + } } /** \internal @@ -114,7 +135,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); - + *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -142,7 +163,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } -#else +#else EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif @@ -156,9 +177,12 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - result = std::malloc(size); + + EIGEN_USING_STD(malloc) + result = 
malloc(size); + #if EIGEN_DEFAULT_ALIGN_BYTES==16 - eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator."); #endif #else result = handmade_aligned_malloc(size); @@ -174,7 +198,10 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - std::free(ptr); + + EIGEN_USING_STD(free) + free(ptr); + #else handmade_aligned_free(ptr); #endif @@ -187,7 +214,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) */ inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) { - EIGEN_UNUSED_VARIABLE(old_size); + EIGEN_UNUSED_VARIABLE(old_size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED @@ -218,7 +245,9 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: { check_that_malloc_is_allowed(); - void *result = std::malloc(size); + EIGEN_USING_STD(malloc) + void *result = malloc(size); + if(!result && size) throw_std_bad_alloc(); return result; @@ -232,7 +261,8 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { - std::free(ptr); + EIGEN_USING_STD(free) + free(ptr); } template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -331,7 +361,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); - aligned_free(ptr); + Eigen::internal::aligned_free(ptr); 
} /** \internal Deletes objects constructed with conditional_aligned_new @@ -471,8 +501,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index } /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size - */ -template + */ +template inline Index first_multiple(Index size, Index base) { return ((size+base-1)/base)*base; @@ -493,6 +523,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); + EIGEN_USING_STD(memcpy) memcpy(target, start, size); } }; @@ -502,7 +533,7 @@ template struct smart_copy_helper { { std::copy(start, end, target); } }; -// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. +// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. template struct smart_memmove_helper; template void smart_memmove(const T* start, const T* end, T* target) @@ -522,19 +553,30 @@ template struct smart_memmove_helper { template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { + { if (UIntPtr(target) < UIntPtr(start)) { std::copy(start, end, target); } - else + else { std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); - std::copy_backward(start, end, target + count); + std::copy_backward(start, end, target + count); } } }; +#if EIGEN_HAS_RVALUE_REFERENCES +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::move(start, end, target); +} +#else +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::copy(start, end, target); +} +#endif /***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) *** @@ -542,7 +584,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior 
regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#ifndef EIGEN_ALLOCA +#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -550,6 +592,15 @@ template struct smart_memmove_helper { #endif #endif +// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is +// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because +// the compiler still emits bad code because stack allocation checks use "<=". +// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772 +// is fixed. +#if defined(__clang__) && defined(__thumb__) + #undef EIGEN_ALLOCA +#endif + // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions. template class aligned_stack_memory_handler : noncopyable @@ -561,12 +612,14 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. 
**/ + EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } + EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) @@ -580,6 +633,60 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; +#ifdef EIGEN_ALLOCA + +template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic + > +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = false; + typedef typename Xpr::Scalar Scalar; + typedef typename nested_eval::type ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) + { + EIGEN_UNUSED_VARIABLE(ptr); + eigen_internal_assert(ptr==0); + } +}; + +template +struct local_nested_eval_wrapper +{ + static const bool NeedExternalBuffer = true; + typedef typename Xpr::Scalar Scalar; + typedef typename plain_object_eval::type PlainObject; + typedef Map ObjectType; + ObjectType object; + + EIGEN_DEVICE_FUNC + local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) + : object(ptr==0 ? 
reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), + m_deallocate(ptr==0) + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::construct_elements_of_array(object.data(), object.size()); + object = xpr; + } + + EIGEN_DEVICE_FUNC + ~local_nested_eval_wrapper() + { + if(NumTraits::RequireInitialization && object.data()) + Eigen::internal::destruct_elements_of_array(object.data(), object.size()); + if(m_deallocate) + Eigen::internal::aligned_free(object.data()); + } + +private: + bool m_deallocate; +}; + +#endif // EIGEN_ALLOCA + template class scoped_array : noncopyable { T* m_ptr; @@ -603,13 +710,15 @@ template void swap(scoped_array &a,scoped_array &b) { std::swap(a.ptr(),b.ptr()); } - + } // end namespace internal /** \internal - * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. + * + * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, + * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -620,9 +729,17 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. 
+ * + * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to + * \code + * typename internal::nested_eval::type NAME(XPR); + * \endcode + * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. + * This is accomplished through alloca if this later is supported and if the required number of bytes + * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA - + #if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. @@ -639,13 +756,23 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) + + #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ + Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ + ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ + ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ + typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) + #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ Eigen::internal::check_size_for_overflow(SIZE); \ TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? 
NAME : 0,SIZE,true) - + + +#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval::type NAME(XPR) + #endif @@ -653,32 +780,56 @@ template void swap(scoped_array &a,scoped_array &b) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ -#if EIGEN_MAX_ALIGN_BYTES!=0 +#if EIGEN_HAS_CXX17_OVERALIGN + +// C++17 -> no need to bother about alignment anymore :) + +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) + +#else + +// HIP does not support new/delete on device. +#if EIGEN_MAX_ALIGN_BYTES!=0 && !defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc(size); } \ EIGEN_CATCH (...) 
{ return 0; } \ } #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void *operator new(std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ + EIGEN_DEVICE_FUNC \ void *operator new[](std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ /* in-place new and delete. since (at least afaik) there is no actual */ \ /* memory allocated we can safely let the default implementation handle */ \ /* this particular case. 
*/ \ + EIGEN_DEVICE_FUNC \ static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \ + EIGEN_DEVICE_FUNC \ static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ + EIGEN_DEVICE_FUNC \ void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ /* nothrow-new (returns zero instead of std::bad_alloc) */ \ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + EIGEN_DEVICE_FUNC \ void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \ Eigen::internal::conditional_aligned_free(ptr); \ } \ @@ -688,20 +839,34 @@ template void swap(scoped_array &a,scoped_array &b) #endif #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0))) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool( \ + ((Size)!=Eigen::Dynamic) && \ + (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES )==0)) || \ + ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \ + ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) ))) + +#endif /****************************************************************************/ /** \class aligned_allocator * \ingroup Core_Module * -* \brief STL compatible allocator to use with with 16 byte aligned types +* \brief STL compatible allocator to use with types requiring a non standrad alignment. +* +* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. 
+* By default, it will thus provide at least 16 bytes alignment and more in following cases: +* - 32 bytes alignment if AVX is enabled. +* - 64 bytes alignment if AVX512 is enabled. +* +* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code * // Matrix4f requires 16 bytes alignment: -* std::map< int, Matrix4f, std::less, +* std::map< int, Matrix4f, std::less, * aligned_allocator > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; @@ -736,6 +901,15 @@ class aligned_allocator : public std::allocator ~aligned_allocator() {} + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) + // In gcc std::allocator::max_size() is bugged making gcc triggers a warning: + // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 + size_type max_size() const { + return (std::numeric_limits::max)()/sizeof(T); + } + #endif + pointer allocate(size_type num, const void* /*hint*/ = 0) { internal::check_size_for_overflow(num); @@ -898,20 +1072,32 @@ inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) { if(max_std_funcs>=4) queryCacheSizes_intel_direct(l1,l2,l3); - else + else if(max_std_funcs>=2) queryCacheSizes_intel_codes(l1,l2,l3); + else + l1 = l2 = l3 = 0; } inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) { int abcd[4]; abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; - EIGEN_CPUID(abcd,0x80000005,0); - l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB - abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; - EIGEN_CPUID(abcd,0x80000006,0); - l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB - l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB + + // First query the 
max supported function. + EIGEN_CPUID(abcd,0x80000000,0); + if(static_cast(abcd[0]) >= static_cast(0x80000006)) + { + EIGEN_CPUID(abcd,0x80000005,0); + l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + EIGEN_CPUID(abcd,0x80000006,0); + l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB + l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB + } + else + { + l1 = l2 = l3 = 0; + } } #endif @@ -927,7 +1113,7 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) // identify the CPU vendor EIGEN_CPUID(abcd,0x0,0); - int max_std_funcs = abcd[1]; + int max_std_funcs = abcd[0]; if(cpuid_is_vendor(abcd,GenuineIntel)) queryCacheSizes_intel(l1,l2,l3,max_std_funcs); else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) diff --git a/externals/eigen/Eigen/src/Core/util/Meta.h b/externals/eigen/Eigen/src/Core/util/Meta.h index 7f637075..81ae2a32 100644 --- a/externals/eigen/Eigen/src/Core/util/Meta.h +++ b/externals/eigen/Eigen/src/Core/util/Meta.h @@ -11,13 +11,54 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(__CUDA_ARCH__) -#include -#include +#if defined(EIGEN_GPU_COMPILE_PHASE) + + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + #endif -#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L +// Recent versions of ICC require for pointer types below. 
+#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11) + +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT #include +namespace Eigen { +namespace numext { +typedef std::uint8_t uint8_t; +typedef std::int8_t int8_t; +typedef std::uint16_t uint16_t; +typedef std::int16_t int16_t; +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provide the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint8_t uint8_t; +typedef ::int8_t int8_t; +typedef ::uint16_t uint16_t; +typedef ::int16_t int16_t; +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} #endif namespace Eigen { @@ -43,26 +84,33 @@ namespace internal { // Only recent versions of ICC complain about using ptrdiff_t to hold pointers, // and older versions do not provide *intptr_t types. 
-#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L +#if EIGEN_ICC_NEEDS_CSTDINT typedef std::intptr_t IntPtr; typedef std::uintptr_t UIntPtr; #else typedef std::ptrdiff_t IntPtr; typedef std::size_t UIntPtr; #endif +#undef EIGEN_ICC_NEEDS_CSTDINT struct true_type { enum { value = 1 }; }; struct false_type { enum { value = 0 }; }; +template +struct bool_constant; + +template<> +struct bool_constant : true_type {}; + +template<> +struct bool_constant : false_type {}; + template struct conditional { typedef Then type; }; template struct conditional { typedef Else type; }; -template struct is_same { enum { value = 0 }; }; -template struct is_same { enum { value = 1 }; }; - template struct remove_reference { typedef T type; }; template struct remove_reference { typedef T type; }; @@ -97,17 +145,65 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +template struct is_same { enum { value = 0 }; }; +template struct is_same { enum { value = 1 }; }; + +template< class T > +struct is_void : is_same::type> {}; + +#if EIGEN_HAS_CXX11 +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +using std::is_integral; +#else +template struct 
is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +#if EIGEN_COMP_MSVC +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +#endif +#endif + +#if EIGEN_HAS_CXX11 +using std::make_unsigned; +#else +// TODO: Possibly improve this implementation of make_unsigned. +// It is currently used only by +// template struct random_default_impl. +template struct make_unsigned; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +#if EIGEN_COMP_MSVC +template<> struct make_unsigned { typedef unsigned __int64 type; }; +template<> struct make_unsigned { typedef unsigned __int64 type; }; +#endif + +// Some platforms define int64_t as `long long` even for C++03, where +// `long long` is not guaranteed by the standard. In this case we are missing +// the definition for make_unsigned. 
If we just define it, we run into issues +// where `long long` doesn't exist in some compilers for C++03. We therefore add +// the specialization for these platforms only. +#if EIGEN_OS_MAC || EIGEN_COMP_MINGW +template<> struct make_unsigned { typedef unsigned long long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; +#endif +#endif template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -121,6 +217,11 @@ template struct add_const_on_value_type { typedef T const template struct add_const_on_value_type { typedef T const* const type; }; template struct add_const_on_value_type { typedef T const* const type; }; +#if EIGEN_HAS_CXX11 + +using std::is_convertible; + +#else template struct is_convertible_impl @@ -134,16 +235,19 @@ struct is_convertible_impl struct yes {int a[1];}; struct no {int a[2];}; - static yes test(const To&, int); + template + static yes test(T, int); + + template static no test(any_conversion, ...); public: - static From ms_from; + static typename internal::remove_reference::type* ms_from; #ifdef __INTEL_COMPILER #pragma warning push #pragma warning ( disable : 2259 ) #endif - enum { value = sizeof(test(ms_from, 0))==sizeof(yes) }; + enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; #ifdef __INTEL_COMPILER #pragma warning pop #endif @@ -152,10 +256,17 @@ struct is_convertible_impl template struct is_convertible { - enum { value = is_convertible_impl::type, - typename remove_all::type>::value }; + enum { value = is_convertible_impl::value }; }; +template +struct is_convertible { enum { value = false }; }; + +template +struct is_convertible { enum { value = true }; }; + +#endif + /** \internal Allows to enable/disable an overload * according to a compile time condition. 
*/ @@ -164,7 +275,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -175,7 +286,7 @@ namespace device { template struct numeric_limits { EIGEN_DEVICE_FUNC - static T epsilon() { return 0; } + static EIGEN_CONSTEXPR T epsilon() { return 0; } static T (max)() { assert(false && "Highest not supported for this type"); } static T (min)() { assert(false && "Lowest not supported for this type"); } static T infinity() { assert(false && "Infinity not supported for this type"); } @@ -183,91 +294,130 @@ template struct numeric_limits }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } - EIGEN_DEVICE_FUNC + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double epsilon() { return __DBL_EPSILON__; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double (max)() { return DBL_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } 
+ static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int (max)() { return INT_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static int (min)() { return INT_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int (max)() { return UINT_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned int (min)() { return 0; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long (max)() { return LONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long (min)() { return LONG_MIN; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long (max)() { return ULONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long (min)() { return 0; } }; template<> struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long (max)() { return LLONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static long long (min)() { return LLONG_MIN; } }; template<> 
struct numeric_limits { - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long (max)() { return ULLONG_MAX; } - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static unsigned long long (min)() { return 0; } }; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool epsilon() { return false; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool (max)() { return true; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static bool (min)() { return false; } +}; } -#endif +#endif // defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 /** \internal - * A base class do disable default copy ctor and copy assignement operator. + * A base class do disable default copy ctor and copy assignment operator. */ class noncopyable { @@ -279,13 +429,82 @@ class noncopyable }; /** \internal - * Convenient struct to get the result type of a unary or binary functor. + * Provides access to the number of elements in the object of as a compile-time constant expression. + * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). + * + * Similar to std::tuple_size, but more general. + * + * It currently supports: + * - any types T defining T::SizeAtCompileTime + * - plain C arrays as T[N] + * - std::array (c++11) + * - some internal types such as SingleRange and AllRange * - * It supports both the current STL mechanism (using the result_type member) as well as - * upcoming next STL generation (using a templated result member). - * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack. + * The second template parameter eases SFINAE-based specializations. 
*/ -#if EIGEN_HAS_STD_RESULT_OF +template struct array_size { + enum { value = Dynamic }; +}; + +template struct array_size::type> { + enum { value = T::SizeAtCompileTime }; +}; + +template struct array_size { + enum { value = N }; +}; +template struct array_size { + enum { value = N }; +}; + +#if EIGEN_HAS_CXX11 +template struct array_size > { + enum { value = N }; +}; +template struct array_size > { + enum { value = N }; +}; +#endif + +/** \internal + * Analogue of the std::size free function. + * It returns the size of the container or view \a x of type \c T + * + * It currently supports: + * - any types T defining a member T::size() const + * - plain C arrays as T[N] + * + */ +template +EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } + +template +EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } + +/** \internal + * Convenient struct to get the result type of a nullary, unary, binary, or + * ternary functor. + * + * Pre C++11: + * Supports both a Func::result_type member and templated + * Func::result::type member. + * + * If none of these members is provided, then the type of the first + * argument is returned. + * + * Post C++11: + * This uses std::result_of. However, note the `type` member removes + * const and converts references/pointers to their corresponding value type. 
+ */ +#if EIGEN_HAS_STD_INVOKE_RESULT +template struct result_of; + +template +struct result_of { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_STD_RESULT_OF template struct result_of { typedef typename std::result_of::type type1; typedef typename remove_all::type type; @@ -297,6 +516,28 @@ struct has_none {int a[1];}; struct has_std_result_type {int a[2];}; struct has_tr1_result {int a[3];}; +template +struct nullary_result_of_select {}; + +template +struct nullary_result_of_select {typedef typename Func::result_type type;}; + +template +struct nullary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename nullary_result_of_select::type type; +}; + template struct unary_result_of_select {typedef typename internal::remove_all::type type;}; @@ -366,6 +607,45 @@ struct result_of { enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; typedef typename ternary_result_of_select::type type; }; + +#endif + +#if EIGEN_HAS_STD_INVOKE_RESULT +template +struct invoke_result { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_CXX11 +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template 
+struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; #endif struct meta_yes { char a[1]; }; @@ -375,10 +655,10 @@ struct meta_no { char a[2]; }; template struct has_ReturnType { - template static meta_yes testFunctor(typename C::ReturnType const *); - template static meta_no testFunctor(...); + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); - enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; }; template const T* return_ptr(); @@ -435,20 +715,25 @@ class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? /** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. + * at compile-time. */ -template +template=B)> struct meta_least_common_multiple { enum { ret = meta_least_common_multiple::ret }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; template -struct meta_least_common_multiple +struct meta_least_common_multiple { enum { ret = A*K }; }; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { @@ -461,17 +746,27 @@ template struct scalar_product_traits // typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; // }; +/** \internal Obtains a POD type suitable to use as storage for an object of a size + * of at most Len bytes, aligned as specified by \c Align. 
+ */ +template +struct aligned_storage { + struct type { + EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; + }; +}; + } // end namespace internal namespace numext { - -#if defined(__CUDA_ARCH__) + +#if defined(EIGEN_GPU_COMPILE_PHASE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) && !EIGEN_HAS_CXX11 using internal::device::numeric_limits; #else using std::numeric_limits; @@ -480,11 +775,36 @@ using std::numeric_limits; // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; } +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const X& x,const Y& y) { return x == y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } +#endif + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } +#endif + } // end namespace 
numext } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h b/externals/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h index 86b60f52..1ce6fd1b 100644 --- a/externals/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/externals/eigen/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -1,4 +1,8 @@ -#ifdef EIGEN_WARNINGS_DISABLED +#ifdef EIGEN_WARNINGS_DISABLED_2 +// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet! +# undef EIGEN_WARNINGS_DISABLED_2 + +#elif defined(EIGEN_WARNINGS_DISABLED) #undef EIGEN_WARNINGS_DISABLED #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS @@ -8,7 +12,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && __GNUC__>=6 + #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic pop #endif diff --git a/externals/eigen/Eigen/src/Core/util/ReshapedHelper.h b/externals/eigen/Eigen/src/Core/util/ReshapedHelper.h new file mode 100644 index 00000000..41243213 --- /dev/null +++ b/externals/eigen/Eigen/src/Core/util/ReshapedHelper.h @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +#ifndef EIGEN_RESHAPED_HELPER_H +#define EIGEN_RESHAPED_HELPER_H + +namespace Eigen { + +enum AutoSize_t { AutoSize }; +const int AutoOrder = 2; + +namespace internal { + +template +struct get_compiletime_reshape_size { + enum { value = get_fixed_value::value }; +}; + +template +Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) { + return internal::get_runtime_value(size); +} + +template +struct get_compiletime_reshape_size { + enum { + other_size = get_fixed_value::value, + value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size }; +}; + +inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { + return total/other; +} + +template +struct get_compiletime_reshape_order { + enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order }; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_RESHAPED_HELPER_H diff --git a/externals/eigen/Eigen/src/Core/util/StaticAssert.h b/externals/eigen/Eigen/src/Core/util/StaticAssert.h index 983361a4..c45de590 100644 --- a/externals/eigen/Eigen/src/Core/util/StaticAssert.h +++ b/externals/eigen/Eigen/src/Core/util/StaticAssert.h @@ -24,9 +24,10 @@ * */ +#ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT - #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) + #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (EIGEN_COMP_CXXVER >= 11) || (EIGEN_COMP_MSVC >= 1600)) // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); @@ -44,64 +45,68 @@ struct static_assertion { enum { - YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX, - YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES, - YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES, - THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, - 
OUT_OF_RANGE_ACCESS, - YOU_MADE_A_PROGRAMMING_MISTAKE, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, - EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, - YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR, - YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR, - UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC, - THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES, - FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED, - NUMERIC_TYPE_MUST_BE_REAL, - COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED, - WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED, - THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE, - INVALID_MATRIX_PRODUCT, - INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS, - INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION, - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY, - THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES, - THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES, - INVALID_MATRIX_TEMPLATE_PARAMETERS, - INVALID_MATRIXBASE_TEMPLATE_PARAMETERS, - BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, - THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, - THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, - YOU_ALREADY_SPECIFIED_THIS_STRIDE, - INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION, - THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD, - PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1, - THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS, - YOU_CANNOT_MIX_ARRAYS_AND_MATRICES, - YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION, - THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY, - YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT, - 
THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS, - THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL, - THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES, - YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED, - YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED, - THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE, - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, - OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, - IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, - THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, - THIS_TYPE_IS_NOT_SUPPORTED, - STORAGE_KIND_MUST_MATCH, - STORAGE_INDEX_MUST_MATCH, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1, + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1, + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1, + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1, + OUT_OF_RANGE_ACCESS=1, + YOU_MADE_A_PROGRAMMING_MISTAKE=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1, + EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1, + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1, + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1, + UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1, + THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1, + NUMERIC_TYPE_MUST_BE_REAL=1, + COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1, + WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1, + THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1, + INVALID_MATRIX_PRODUCT=1, + 
INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1, + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1, + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1, + THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1, + INVALID_MATRIX_TEMPLATE_PARAMETERS=1, + INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1, + BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1, + THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1, + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1, + YOU_ALREADY_SPECIFIED_THIS_STRIDE=1, + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1, + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1, + THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1, + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1, + YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1, + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1, + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1, + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1, + THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1, + THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1, + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1, + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1, + THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1, + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1, + OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1, + IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1, + STORAGE_LAYOUT_DOES_NOT_MATCH=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1, + 
THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1, + THIS_TYPE_IS_NOT_SUPPORTED=1, + STORAGE_KIND_MUST_MATCH=1, + STORAGE_INDEX_MUST_MATCH=1, + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, + INVALID_TEMPLATE_PARAMETER=1, + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1, + THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1 }; }; @@ -131,7 +136,7 @@ #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); #endif // EIGEN_NO_STATIC_ASSERT - +#endif // EIGEN_STATIC_ASSERT // static assertion failing if the type \a TYPE is not a vector type #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \ @@ -180,7 +185,7 @@ ) #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \ - EIGEN_STATIC_ASSERT(!NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) + EIGEN_STATIC_ASSERT(!Eigen::NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes @@ -190,8 +195,8 @@ YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES) #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \ - EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \ - (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \ + EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \ + (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \ THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS) #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \ diff --git a/externals/eigen/Eigen/src/Core/util/SymbolicIndex.h b/externals/eigen/Eigen/src/Core/util/SymbolicIndex.h new file mode 100644 index 00000000..354dd9ad --- /dev/null +++ b/externals/eigen/Eigen/src/Core/util/SymbolicIndex.h @@ -0,0 +1,293 @@ +// This file is 
part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SYMBOLIC_INDEX_H +#define EIGEN_SYMBOLIC_INDEX_H + +namespace Eigen { + +/** \namespace Eigen::symbolic + * \ingroup Core_Module + * + * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index. + * Here is a simple example: + * + * \code + * // First step, defines symbols: + * struct x_tag {}; static const symbolic::SymbolExpr x; + * struct y_tag {}; static const symbolic::SymbolExpr y; + * struct z_tag {}; static const symbolic::SymbolExpr z; + * + * // Defines an expression: + * auto expr = (x+3)/y+z; + * + * // And evaluate it: (c++14) + * std::cout << expr.eval(x=6,y=3,z=-13) << "\n"; + * + * // In c++98/11, only one symbol per expression is supported for now: + * auto expr98 = (3-x)/2; + * std::cout << expr98.eval(x=6) << "\n"; + * \endcode + * + * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN. + * + */ +namespace symbolic { + +template class Symbol; +template class NegateExpr; +template class AddExpr; +template class ProductExpr; +template class QuotientExpr; + +// A simple wrapper around an integral value to provide the eval method. +// We could also use a free-function symbolic_eval... +template +class ValueExpr { +public: + ValueExpr(IndexType val) : m_value(val) {} + template + IndexType eval_impl(const T&) const { return m_value; } +protected: + IndexType m_value; +}; + +// Specialization for compile-time value, +// It is similar to ValueExpr(N) but this version helps the compiler to generate better code. 
+template +class ValueExpr > { +public: + ValueExpr() {} + template + EIGEN_CONSTEXPR Index eval_impl(const T&) const { return N; } +}; + + +/** \class BaseExpr + * \ingroup Core_Module + * Common base class of any symbolic expressions + */ +template +class BaseExpr +{ +public: + const Derived& derived() const { return *static_cast(this); } + + /** Evaluate the expression given the \a values of the symbols. + * + * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue + * as constructed by SymbolExpr::operator= operator. + * + */ + template + Index eval(const T& values) const { return derived().eval_impl(values); } + +#if EIGEN_HAS_CXX14 + template + Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); } +#endif + + NegateExpr operator-() const { return NegateExpr(derived()); } + + AddExpr > operator+(Index b) const + { return AddExpr >(derived(), b); } + AddExpr > operator-(Index a) const + { return AddExpr >(derived(), -a); } + ProductExpr > operator*(Index a) const + { return ProductExpr >(derived(),a); } + QuotientExpr > operator/(Index a) const + { return QuotientExpr >(derived(),a); } + + friend AddExpr > operator+(Index a, const BaseExpr& b) + { return AddExpr >(b.derived(), a); } + friend AddExpr,ValueExpr<> > operator-(Index a, const BaseExpr& b) + { return AddExpr,ValueExpr<> >(-b.derived(), a); } + friend ProductExpr,Derived> operator*(Index a, const BaseExpr& b) + { return ProductExpr,Derived>(a,b.derived()); } + friend QuotientExpr,Derived> operator/(Index a, const BaseExpr& b) + { return QuotientExpr,Derived>(a,b.derived()); } + + template + AddExpr > > operator+(internal::FixedInt) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + AddExpr > > operator-(internal::FixedInt) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + ProductExpr > > operator*(internal::FixedInt) const + { return ProductExpr > 
>(derived(),ValueExpr >()); } + template + QuotientExpr > > operator/(internal::FixedInt) const + { return QuotientExpr > >(derived(),ValueExpr >()); } + + template + friend AddExpr > > operator+(internal::FixedInt, const BaseExpr& b) + { return AddExpr > >(b.derived(), ValueExpr >()); } + template + friend AddExpr,ValueExpr > > operator-(internal::FixedInt, const BaseExpr& b) + { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } + template + friend ProductExpr >,Derived> operator*(internal::FixedInt, const BaseExpr& b) + { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } + template + friend QuotientExpr >,Derived> operator/(internal::FixedInt, const BaseExpr& b) + { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } + +#if (!EIGEN_HAS_CXX14) + template + AddExpr > > operator+(internal::FixedInt (*)()) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + AddExpr > > operator-(internal::FixedInt (*)()) const + { return AddExpr > >(derived(), ValueExpr >()); } + template + ProductExpr > > operator*(internal::FixedInt (*)()) const + { return ProductExpr > >(derived(),ValueExpr >()); } + template + QuotientExpr > > operator/(internal::FixedInt (*)()) const + { return QuotientExpr > >(derived(),ValueExpr >()); } + + template + friend AddExpr > > operator+(internal::FixedInt (*)(), const BaseExpr& b) + { return AddExpr > >(b.derived(), ValueExpr >()); } + template + friend AddExpr,ValueExpr > > operator-(internal::FixedInt (*)(), const BaseExpr& b) + { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } + template + friend ProductExpr >,Derived> operator*(internal::FixedInt (*)(), const BaseExpr& b) + { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } + template + friend QuotientExpr >,Derived> operator/(internal::FixedInt (*)(), const BaseExpr& b) + { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } +#endif + + + template + AddExpr operator+(const BaseExpr &b) const + { return 
AddExpr(derived(), b.derived()); } + + template + AddExpr > operator-(const BaseExpr &b) const + { return AddExpr >(derived(), -b.derived()); } + + template + ProductExpr operator*(const BaseExpr &b) const + { return ProductExpr(derived(), b.derived()); } + + template + QuotientExpr operator/(const BaseExpr &b) const + { return QuotientExpr(derived(), b.derived()); } +}; + +template +struct is_symbolic { + // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr. + enum { value = internal::is_convertible >::value }; +}; + +/** Represents the actual value of a symbol identified by its tag + * + * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used. + */ +template +class SymbolValue +{ +public: + /** Default constructor from the value \a val */ + SymbolValue(Index val) : m_value(val) {} + + /** \returns the stored value of the symbol */ + Index value() const { return m_value; } +protected: + Index m_value; +}; + +/** Expression of a symbol uniquely identified by the template parameter type \c tag */ +template +class SymbolExpr : public BaseExpr > +{ +public: + /** Alias to the template parameter \c tag */ + typedef tag Tag; + + SymbolExpr() {} + + /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag. + * + * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value. 
+ */ + SymbolValue operator=(Index val) const { + return SymbolValue(val); + } + + Index eval_impl(const SymbolValue &values) const { return values.value(); } + +#if EIGEN_HAS_CXX14 + // C++14 versions suitable for multiple symbols + template + Index eval_impl(const std::tuple& values) const { return std::get >(values).value(); } +#endif +}; + +template +class NegateExpr : public BaseExpr > +{ +public: + NegateExpr(const Arg0& arg0) : m_arg0(arg0) {} + + template + Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); } +protected: + Arg0 m_arg0; +}; + +template +class AddExpr : public BaseExpr > +{ +public: + AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +template +class ProductExpr : public BaseExpr > +{ +public: + ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +template +class QuotientExpr : public BaseExpr > +{ +public: + QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} + + template + Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); } +protected: + Arg0 m_arg0; + Arg1 m_arg1; +}; + +} // end namespace symbolic + +} // end namespace Eigen + +#endif // EIGEN_SYMBOLIC_INDEX_H diff --git a/externals/eigen/Eigen/src/Core/util/XprHelper.h b/externals/eigen/Eigen/src/Core/util/XprHelper.h index ba5bd186..71c32b8a 100644 --- a/externals/eigen/Eigen/src/Core/util/XprHelper.h +++ b/externals/eigen/Eigen/src/Core/util/XprHelper.h @@ -34,6 +34,26 @@ inline IndexDest convert_index(const IndexSrc& idx) { return IndexDest(idx); } +// true if T can be considered as an integral index 
(i.e., and integral type or enum) +template struct is_valid_index_type +{ + enum { value = +#if EIGEN_HAS_TYPE_TRAITS + internal::is_integral::value || std::is_enum::value +#elif EIGEN_COMP_MSVC + internal::is_integral::value || __is_enum(T) +#else + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. + internal::is_convertible::value && !internal::is_same::value && !is_same::value +#endif + }; +}; + +// true if both types are not valid index types +template +struct valid_indexed_view_overload { + enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; +}; // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar @@ -90,6 +110,9 @@ class no_assignment_operator { private: no_assignment_operator& operator=(const no_assignment_operator&); + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator) }; /** \internal return the index type with the largest number of bits */ @@ -106,19 +129,23 @@ struct promote_index_type template class variable_if_dynamic { public: - EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + operator T() const { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void setValue(T v) const { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } }; template class variable_if_dynamic { T m_value; - EIGEN_DEVICE_FUNC 
variable_if_dynamic() { eigen_assert(false); } public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) EIGEN_NO_THROW : m_value(value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } }; @@ -129,8 +156,10 @@ template class variable_if_dynamicindex public: EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR + T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void setValue(T) {} }; template class variable_if_dynamicindex @@ -155,16 +184,7 @@ template struct functor_traits template struct packet_traits; -template struct unpacket_traits -{ - typedef T type; - typedef T half; - enum - { - size = 1, - alignment = 1 - }; -}; +template struct unpacket_traits; template::size)==0 || is_same::half>::value> @@ -383,7 +403,7 @@ template struct plain_matrix_type_row_major typedef Matrix::Scalar, Rows, Cols, - (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor, + (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor, MaxRows, MaxCols > type; @@ -400,7 +420,7 @@ struct ref_selector T const&, const T >::type type; - + typedef typename conditional< bool(traits::Flags & NestByRefBit), T &, @@ -438,7 +458,7 @@ template { enum { ScalarReadCost = NumTraits::Scalar>::ReadCost, - CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? 
+ CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON // for all evaluator creating a temporary. This flag is then propagated by the parent evaluators. @@ -579,14 +599,14 @@ template MatrixRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType; typedef Array ArrayRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType; typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixRowType, - ArrayRowType + ArrayRowType >::type type; }; @@ -601,7 +621,7 @@ struct plain_col_type typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixColType, - ArrayColType + ArrayColType >::type type; }; @@ -617,7 +637,7 @@ struct plain_diag_type typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, MatrixDiagType, - ArrayDiagType + ArrayDiagType >::type type; }; @@ -654,24 +674,39 @@ template struct is_diagonal > template struct is_diagonal > { enum { ret = true }; }; + +template struct is_identity +{ enum { value = false }; }; + +template struct is_identity, T> > +{ enum { value = true }; }; + + template struct glue_shapes; template<> struct glue_shapes { typedef TriangularShape type; }; template -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::ret&&has_direct_access::ret, T1>::type * = 0) +struct possibly_same_dense { + enum { value = has_direct_access::ret && has_direct_access::ret && is_same::value }; +}; + +template +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && 
(mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template -bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret&&has_direct_access::ret), T1>::type * = 0) +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; } // Internal helper defining the cost of a scalar division for the type T. // The default heuristic can be specialized for each scalar type and architecture. -template +template struct scalar_div_cost { enum { value = 8*NumTraits::MulCost }; }; @@ -718,7 +753,7 @@ std::string demangle_flags(int f) if(f&DirectAccessBit) res += " | Direct"; if(f&NestByRefBit) res += " | NestByRef"; if(f&NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit"; - + return res; } #endif @@ -815,7 +850,7 @@ struct ScalarBinaryOpTraits #define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \ EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType >::value), \ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - + } // end namespace Eigen #endif // EIGEN_XPRHELPER_H diff --git a/externals/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/externals/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h index dc5fae06..081e918f 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/externals/eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ ComputationInfo info() const { diff --git a/externals/eigen/Eigen/src/Eigenvalues/ComplexSchur.h b/externals/eigen/Eigen/src/Eigenvalues/ComplexSchur.h index 7f38919f..fc71468f 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/externals/eigen/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { @@ -300,10 +300,13 @@ typename ComplexSchur::ComplexScalar ComplexSchur::compu ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1); ComplexScalar eival1 = (trace + disc) / RealScalar(2); ComplexScalar eival2 = (trace - disc) / RealScalar(2); - - if(numext::norm1(eival1) > numext::norm1(eival2)) + RealScalar eival1_norm = numext::norm1(eival1); + RealScalar eival2_norm = numext::norm1(eival2); + // A division by zero can only occur if eival1==eival2==0. + // In this case, det==0, and all we have to do is checking that eival2_norm!=0 + if(eival1_norm > eival2_norm) eival2 = det / eival1; - else + else if(eival2_norm!=RealScalar(0)) eival1 = det / eival2; // choose the eigenvalue closest to the bottom entry of the diagonal diff --git a/externals/eigen/Eigen/src/Eigenvalues/EigenSolver.h b/externals/eigen/Eigen/src/Eigenvalues/EigenSolver.h index f205b185..572b29e4 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/EigenSolver.h +++ b/externals/eigen/Eigen/src/Eigenvalues/EigenSolver.h @@ -110,7 +110,7 @@ template class EigenSolver * * \sa compute() for an example. 
*/ - EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {} + EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {} /** \brief Default constructor with memory preallocation * @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/externals/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/externals/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 36a91dff..87d789b3 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/externals/eigen/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -311,7 +311,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp // Aliases: Map v(reinterpret_cast(m_tmp.data()), size); ComplexVectorType &cv = m_tmp; - const MatrixType &mZ = m_realQZ.matrixZ(); const MatrixType &mS = m_realQZ.matrixS(); const MatrixType &mT = m_realQZ.matrixT(); @@ -351,7 +350,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp } } } - m_eivec.col(i).real().noalias() = mZ.transpose() * v; + m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v; m_eivec.col(i).real().normalize(); m_eivec.col(i).imag().setConstant(0); } @@ -400,7 +399,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp / (alpha*mT.coeffRef(j,j) - static_cast(beta*mS.coeffRef(j,j))); } } - m_eivec.col(i+1).noalias() = (mZ.transpose() * cv); + m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv); m_eivec.col(i+1).normalize(); 
m_eivec.col(i) = m_eivec.col(i+1).conjugate(); } diff --git a/externals/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/externals/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index 5f6bb828..d0f9091b 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/externals/eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * Accoring to \p options, this function computes eigenvalues and (if requested) + * According to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/externals/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/externals/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h index f647f69b..1f211393 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/externals/eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -267,7 +267,7 @@ template class HessenbergDecomposition private: - typedef Matrix VectorType; + typedef Matrix VectorType; typedef typename NumTraits::Real RealScalar; static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp); @@ -315,7 +315,7 @@ void HessenbergDecomposition::_compute(MatrixType& matA, CoeffVector // A = A H' matA.rightCols(remainingSize) - .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0)); + .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0)); } } diff --git a/externals/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/externals/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af0..66e5a3db 100644 --- 
a/externals/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/externals/eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -66,7 +66,6 @@ template inline typename MatrixBase::EigenvaluesReturnType MatrixBase::eigenvalues() const { - typedef typename internal::traits::Scalar Scalar; return internal::eigenvalues_selector::IsComplex>::run(derived()); } @@ -85,10 +84,9 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { - typedef typename SelfAdjointView::PlainObject PlainObject; PlainObject thisAsMatrix(*this); return SelfAdjointEigenSolver(thisAsMatrix, false).eigenvalues(); } @@ -149,7 +147,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/externals/eigen/Eigen/src/Eigenvalues/RealQZ.h b/externals/eigen/Eigen/src/Eigenvalues/RealQZ.h index b3a910dd..50913018 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/RealQZ.h +++ b/externals/eigen/Eigen/src/Eigenvalues/RealQZ.h @@ -90,8 +90,9 @@ namespace Eigen { m_Z(size, size), m_workspace(size*2), m_maxIters(400), - m_isInitialized(false) - { } + m_isInitialized(false), + m_computeQZ(true) + {} /** \brief Constructor; computes real QZ decomposition of given matrices * @@ -108,9 +109,11 @@ namespace Eigen { m_Z(A.rows(),A.cols()), m_workspace(A.rows()*2), m_maxIters(400), - m_isInitialized(false) { - compute(A, B, computeQZ); - } + m_isInitialized(false), + m_computeQZ(true) + { + compute(A, B, computeQZ); + } /** \brief Returns matrix Q in the QZ decomposition. 
* @@ -161,7 +164,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/externals/eigen/Eigen/src/Eigenvalues/RealSchur.h b/externals/eigen/Eigen/src/Eigenvalues/RealSchur.h index f5c86041..7304ef34 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/RealSchur.h +++ b/externals/eigen/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { @@ -236,7 +236,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu); + Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -270,8 +270,13 @@ RealSchur& RealSchur::compute(const EigenBase // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); - // Step 2. Reduce to real Schur form - computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU); + // Step 2. Reduce to real Schur form + // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg + // to be able to pass our working-space buffer for the Householder to Dense evaluation. 
+ m_workspaceVector.resize(matrix.cols()); + if(computeU) + m_hess.matrixQ().evalTo(m_matU, m_workspaceVector); + computeFromHessenberg(m_hess.matrixH(), m_matU, computeU); m_matT *= scale; @@ -284,13 +289,13 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa using std::abs; m_matT = matrixH; - if(computeU) + m_workspaceVector.resize(m_matT.cols()); + if(computeU && !internal::is_same_dense(m_matU,matrixQ)) m_matU = matrixQ; Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrixH.rows(); - m_workspaceVector.resize(m_matT.cols()); Scalar* workspace = &m_workspaceVector.coeffRef(0); // The matrix m_matT is divided in three parts. @@ -302,12 +307,16 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Index totalIter = 0; // iteration count for whole matrix Scalar exshift(0); // sum of exceptional shifts Scalar norm = computeNormOfT(); + // sub-diagonal entries smaller than considerAsZero will be treated as zero. + // We use eps^2 to enable more precision in small eigenvalues. 
+ Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), + (std::numeric_limits::min)() ); - if(norm!=0) + if(norm!=Scalar(0)) { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu); + Index il = findSmallSubdiagEntry(iu,considerAsZero); // Check for convergence if (il == iu) // One root found @@ -327,7 +336,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa else // No convergence yet { // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG ) - Vector3s firstHouseholderVector(0,0,0), shiftInfo; + Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo; computeShift(iu, iter, exshift, shiftInfo); iter = iter + 1; totalIter = totalIter + 1; @@ -364,14 +373,17 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline Index RealSchur::findSmallSubdiagEntry(Index iu) +inline Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) + + s = numext::maxi(s * NumTraits::epsilon(), considerAsZero); + + if (abs(m_matT.coeff(res,res-1)) <= s) break; res--; } diff --git a/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 9ddd553f..14692365 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, 
MatrixType& eivec); } @@ -42,10 +44,14 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag * \f$ v \f$ such that \f$ Av = \lambda v \f$. The eigenvalues of a * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the - * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint - * matrices, the matrix \f$ V \f$ is always invertible). This is called the + * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the * eigendecomposition. * + * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal + * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then + * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is + * equal to its transpose, \f$ V^{-1} = V^T \f$. + * * The algorithm exploits the fact that the matrix is selfadjoint, making it * faster and more accurate than the general purpose eigenvalue algorithms * implemented in EigenSolver and ComplexEigenSolver. @@ -119,7 +125,10 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), - m_isInitialized(false) + m_hcoeffs(), + m_info(InvalidInput), + m_isInitialized(false), + m_eigenvectorsOk(false) { } /** \brief Constructor, pre-allocates memory for dynamic-size matrices. @@ -139,7 +148,9 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), - m_isInitialized(false) + m_hcoeffs(size > 1 ? size - 1 : 1), + m_isInitialized(false), + m_eigenvectorsOk(false) {} /** \brief Constructor; computes eigendecomposition of given matrix. @@ -163,7 +174,9 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), - m_isInitialized(false) + m_hcoeffs(matrix.cols() > 1 ? 
matrix.cols() - 1 : 1), + m_isInitialized(false), + m_eigenvectorsOk(false) { compute(matrix.derived(), options); } @@ -250,6 +263,11 @@ template class SelfAdjointEigenSolver * matrix \f$ A \f$, then the matrix returned by this function is the * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$. * + * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal + * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then + * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is + * equal to its transpose, \f$ V^{-1} = V^T \f$. + * * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out * @@ -337,7 +355,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. 
*/ EIGEN_DEVICE_FUNC ComputationInfo info() const @@ -354,7 +372,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -362,6 +381,7 @@ template class SelfAdjointEigenSolver EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; + typename TridiagonalizationType::CoeffVectorType m_hcoeffs; ComputationInfo m_info; bool m_isInitialized; bool m_eigenvectorsOk; @@ -403,7 +423,7 @@ ::compute(const EigenBase& a_matrix, int options) const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -434,7 +454,8 @@ ::compute(const EigenBase& a_matrix, int options) if(scale==RealScalar(0)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); - internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors); + m_hcoeffs.resize(n-1); + internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors); m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec); @@ -479,10 +500,9 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; - ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -493,15 +513,23 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag typedef typename DiagType::RealScalar RealScalar; const RealScalar considerAsZero = (std::numeric_limits::min)(); - const RealScalar precision = 
RealScalar(2)*NumTraits::epsilon(); - + const RealScalar precision_inv = RealScalar(1)/NumTraits::epsilon(); while (end>0) { - for (Index i = start; i0 && subdiag[end-1]==RealScalar(0)) { end--; @@ -535,7 +563,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -566,10 +594,10 @@ template struct direct_selfadjoint_eigenvalues struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD(abs); + EIGEN_USING_STD(sqrt); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -616,8 +645,8 @@ template struct direct_selfadjoint_eigenvaluesn1) res = c0/std::sqrt(n0); - else res = c1/std::sqrt(n1); + if(n0>n1) res = c0/sqrt(n0); + else res = c1/sqrt(n1); return true; } @@ -719,7 +748,7 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) { - using std::sqrt; + EIGEN_USING_STD(sqrt); const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0))); const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1)); roots(0) = t1 - t0; @@ -729,8 +758,8 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void run(SolverType& solver, const MatrixType& mat, int options) { - EIGEN_USING_STD_MATH(sqrt); - EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD(sqrt); + EIGEN_USING_STD(abs); eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 @@ -803,32 +832,38 @@ ::computeDirect(const MatrixType& matrix, int options) } namespace internal { + +// Francis implicit QR step. 
template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + // Wilkinson Shift. RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still // underflow thus leading to inf/NaN values when using the following commented code: -// RealScalar e2 = numext::abs2(subdiag[end-1]); -// RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2)); + // RealScalar e2 = numext::abs2(subdiag[end-1]); + // RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2)); // This explain the following, somewhat more complicated, version: RealScalar mu = diag[end]; - if(td==RealScalar(0)) - mu -= abs(e); - else - { - RealScalar e2 = numext::abs2(subdiag[end-1]); - RealScalar h = numext::hypot(td,e); - if(e2==RealScalar(0)) mu -= (e / (td + (td>RealScalar(0) ? RealScalar(1) : RealScalar(-1)))) * (e / h); - else mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); + if(td==RealScalar(0)) { + mu -= numext::abs(e); + } else if (e != RealScalar(0)) { + const RealScalar e2 = numext::abs2(e); + const RealScalar h = numext::hypot(td,e); + if(e2 == RealScalar(0)) { + mu -= e / ((td + (td>RealScalar(0) ? h : -h)) / e); + } else { + mu -= e2 / (td + (td>RealScalar(0) ? h : -h)); + } } - + RealScalar x = diag[start] - mu; RealScalar z = subdiag[start]; - for (Index k = start; k < end; ++k) + // If z ever becomes zero, the Givens rotation will be the identity and + // z will stay zero for all future iterations. 
+ for (Index k = start; k < end && z != RealScalar(0); ++k) { JacobiRotation rot; rot.makeGivens(x, z); @@ -841,12 +876,11 @@ static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index sta diag[k+1] = rot.s() * sdk + rot.c() * dkp1; subdiag[k] = rot.c() * sdk - rot.s() * dkp1; - if (k > start) subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z; + // "Chasing the bulge" to return to triangular form. x = subdiag[k]; - if (k < end - 1) { z = -rot.s() * subdiag[k+1]; diff --git a/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h index 3891cf88..b0c947dc 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +++ b/externals/eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h @@ -37,7 +37,7 @@ namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \ +#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \ template<> template inline \ SelfAdjointEigenSolver >& \ SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ @@ -47,7 +47,7 @@ SelfAdjointEigenSolver >::compute(c && (options&EigVecMask)!=EigVecMask \ && "invalid option parameter"); \ bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \ - lapack_int n = internal::convert_index(matrix.cols()), lda, matrix_order, info; \ + lapack_int n = internal::convert_index(matrix.cols()), lda, info; \ m_eivalues.resize(n,1); \ m_subdiag.resize(n-1); \ m_eivec = matrix; \ @@ -63,27 +63,24 @@ SelfAdjointEigenSolver >::compute(c } \ \ lda = internal::convert_index(m_eivec.outerStride()); \ - matrix_order=LAPACKE_COLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 
'V' : 'N'; \ \ - info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ + info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ m_info = (info==0) ? Success : NoConvergence; \ m_isInitialized = true; \ m_eigenvectorsOk = computeEigenvectors; \ return *this; \ } +#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor ) -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR) - -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR) +EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev) +EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev) +EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev) +EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev) } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h b/externals/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17..674c92a3 100644 --- a/externals/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +++ 
b/externals/eigen/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -11,10 +11,10 @@ #ifndef EIGEN_TRIDIAGONALIZATION_H #define EIGEN_TRIDIAGONALIZATION_H -namespace Eigen { +namespace Eigen { namespace internal { - + template struct TridiagonalizationMatrixTReturnType; template struct traits > @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -352,7 +354,7 @@ void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) Index n = matA.rows(); eigen_assert(n==matA.cols()); eigen_assert(n==hCoeffs.size()+1 || n==1); - + for (Index i = 0; i -void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) +template +EIGEN_DEVICE_FUNC +void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, + CoeffVectorType& hcoeffs, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); - tridiagonalization_inplace_selector::run(mat, diag, subdiag, extractQ); + tridiagonalization_inplace_selector::run(mat, diag, subdiag, hcoeffs, extractQ); } /** \internal @@ -439,10 +443,10 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { - CoeffVectorType hCoeffs(mat.cols()-1); - tridiagonalization_inplace(mat,hCoeffs); + 
tridiagonalization_inplace(mat, hCoeffs); diag = mat.diagonal().real(); subdiag = mat.template diagonal<-1>().real(); if(extractQ) @@ -462,8 +466,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + template + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ) { using std::sqrt; const RealScalar tol = (std::numeric_limits::min)(); @@ -507,8 +511,9 @@ struct tridiagonalization_inplace_selector { typedef typename MatrixType::Scalar Scalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + template + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) @@ -542,8 +547,8 @@ template struct TridiagonalizationMatrixTReturnType result.template diagonal<-1>() = m_matrix.template diagonal<-1>(); } - Index rows() const { return m_matrix.rows(); } - Index cols() const { return m_matrix.cols(); } + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } protected: typename MatrixType::Nested m_matrix; diff --git a/externals/eigen/Eigen/src/Geometry/AlignedBox.h b/externals/eigen/Eigen/src/Geometry/AlignedBox.h index 066eae4f..55a9d0ae 100644 --- a/externals/eigen/Eigen/src/Geometry/AlignedBox.h +++ b/externals/eigen/Eigen/src/Geometry/AlignedBox.h @@ -7,10 +7,46 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+// Function void Eigen::AlignedBox::transform(const Transform& transform) +// is provided under the following license agreement: +// +// Software License Agreement (BSD License) +// +// Copyright (c) 2011-2014, Willow Garage, Inc. +// Copyright (c) 2014-2015, Open Source Robotics Foundation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Open Source Robotics Foundation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ #ifndef EIGEN_ALIGNEDBOX_H #define EIGEN_ALIGNEDBOX_H -namespace Eigen { +namespace Eigen { /** \geometry_module \ingroup Geometry_Module * @@ -63,7 +99,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) /** Default constructor initializing a null box. */ EIGEN_DEVICE_FUNC inline AlignedBox() - { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); } + { if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime!=Dynamic)) setEmpty(); } /** Constructs a null box with \a _dim the dimension of the ambient space. */ EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim) @@ -231,7 +267,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); } /** Returns an AlignedBox that is the union of \a b and \c *this. - * \note Merging with an empty box may result in a box bigger than \c *this. + * \note Merging with an empty box may result in a box bigger than \c *this. * \sa extend(const AlignedBox&) */ EIGEN_DEVICE_FUNC inline AlignedBox merged(const AlignedBox& b) const { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); } @@ -246,6 +282,15 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) return *this; } + /** \returns a copy of \c *this translated by the vector \a t. */ + template + EIGEN_DEVICE_FUNC inline AlignedBox translated(const MatrixBase& a_t) const + { + AlignedBox result(m_min, m_max); + result.translate(a_t); + return result; + } + /** \returns the squared distance between the point \a p and the box \c *this, * and zero if \a p is inside the box. 
* \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&) @@ -265,14 +310,63 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) */ template EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const MatrixBase& p) const - { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); } + { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p))); } /** \returns the distance between the boxes \a b and \c *this, * and zero if the boxes intersect. * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&) */ EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const AlignedBox& b) const - { EIGEN_USING_STD_MATH(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); } + { EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b))); } + + /** + * Specialization of transform for pure translation. + */ + template + EIGEN_DEVICE_FUNC inline void transform( + const typename Transform::TranslationType& translation) + { + this->translate(translation); + } + + /** + * Transforms this box by \a transform and recomputes it to + * still be an axis-aligned box. + * + * \note This method is provided under BSD license (see the top of this file). + */ + template + EIGEN_DEVICE_FUNC inline void transform(const Transform& transform) + { + // Only Affine and Isometry transforms are currently supported. + EIGEN_STATIC_ASSERT(Mode == Affine || Mode == AffineCompact || Mode == Isometry, THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS); + + // Method adapted from FCL src/shape/geometric_shapes_utility.cpp#computeBV(...) 
+ // https://github.com/flexible-collision-library/fcl/blob/fcl-0.4/src/shape/geometric_shapes_utility.cpp#L292 + // + // Here's a nice explanation why it works: https://zeuxcg.org/2010/10/17/aabb-from-obb-with-component-wise-abs/ + + // two times rotated extent + const VectorType rotated_extent_2 = transform.linear().cwiseAbs() * sizes(); + // two times new center + const VectorType rotated_center_2 = transform.linear() * (this->m_max + this->m_min) + + Scalar(2) * transform.translation(); + + this->m_max = (rotated_center_2 + rotated_extent_2) / Scalar(2); + this->m_min = (rotated_center_2 - rotated_extent_2) / Scalar(2); + } + + /** + * \returns a copy of \c *this transformed by \a transform and recomputed to + * still be an axis-aligned box. + */ + template + EIGEN_DEVICE_FUNC AlignedBox transformed(const Transform& transform) const + { + AlignedBox result(m_min, m_max); + result.transform(transform); + return result; + } /** \returns \c *this with scalar type casted to \a NewScalarType * diff --git a/externals/eigen/Eigen/src/Geometry/AngleAxis.h b/externals/eigen/Eigen/src/Geometry/AngleAxis.h index 0af3c1b0..78328b6b 100644 --- a/externals/eigen/Eigen/src/Geometry/AngleAxis.h +++ b/externals/eigen/Eigen/src/Geometry/AngleAxis.h @@ -169,8 +169,8 @@ template template EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const QuaternionBase& q) { - EIGEN_USING_STD_MATH(atan2) - EIGEN_USING_STD_MATH(abs) + EIGEN_USING_STD(atan2) + EIGEN_USING_STD(abs) Scalar n = q.vec().norm(); if(n::epsilon()) n = q.vec().stableNorm(); @@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const Quaterni if (n != Scalar(0)) { m_angle = Scalar(2)*atan2(n, abs(q.w())); - if(q.w() < 0) + if(q.w() < Scalar(0)) n = -n; m_axis = q.vec() / n; } @@ -217,8 +217,8 @@ template typename AngleAxis::Matrix3 EIGEN_DEVICE_FUNC AngleAxis::toRotationMatrix(void) const { - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) Matrix3 res; 
Vector3 sin_axis = sin(m_angle) * m_axis; Scalar c = cos(m_angle); diff --git a/externals/eigen/Eigen/src/Geometry/EulerAngles.h b/externals/eigen/Eigen/src/Geometry/EulerAngles.h index c633268a..19b734ca 100644 --- a/externals/eigen/Eigen/src/Geometry/EulerAngles.h +++ b/externals/eigen/Eigen/src/Geometry/EulerAngles.h @@ -36,9 +36,9 @@ template EIGEN_DEVICE_FUNC inline Matrix::Scalar,3,1> MatrixBase::eulerAngles(Index a0, Index a1, Index a2) const { - EIGEN_USING_STD_MATH(atan2) - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(atan2) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) /* Implemented from Graphics Gems IV */ EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3) diff --git a/externals/eigen/Eigen/src/Geometry/Homogeneous.h b/externals/eigen/Eigen/src/Geometry/Homogeneous.h index 5f0da1a9..94083ac5 100644 --- a/externals/eigen/Eigen/src/Geometry/Homogeneous.h +++ b/externals/eigen/Eigen/src/Geometry/Homogeneous.h @@ -10,7 +10,7 @@ #ifndef EIGEN_HOMOGENEOUS_H #define EIGEN_HOMOGENEOUS_H -namespace Eigen { +namespace Eigen { /** \geometry_module \ingroup Geometry_Module * @@ -72,9 +72,11 @@ template class Homogeneous : m_matrix(matrix) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical ? 1 : 0); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); } - + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_matrix.rows() + (int(Direction)==Vertical ? 1 : 0); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols() + (int(Direction)==Horizontal ? 
1 : 0); } + EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; } template @@ -262,8 +264,10 @@ struct homogeneous_left_product_impl,Lhs> m_rhs(rhs) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const { @@ -300,8 +304,8 @@ struct homogeneous_right_product_impl,Rhs> : m_lhs(lhs), m_rhs(rhs) {} - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } template EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const { @@ -322,7 +326,7 @@ template struct evaluator_traits > { typedef typename storage_kind_to_evaluator_kind::Kind Kind; - typedef HomogeneousShape Shape; + typedef HomogeneousShape Shape; }; template<> struct AssignmentKind { typedef Dense2Dense Kind; }; @@ -414,7 +418,7 @@ struct product_evaluator, ProductTag, Homogeneous typedef typename helper::ConstantBlock ConstantBlock; typedef typename helper::Xpr RefactoredXpr; typedef evaluator Base; - + EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base( xpr.lhs().nestedExpression() .lazyProduct( xpr.rhs().template topRows(xpr.lhs().nestedExpression().cols()) ) + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) ) @@ -467,7 +471,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef typename helper::ConstantBlock ConstantBlock; typedef typename helper::Xpr RefactoredXpr; typedef evaluator Base; - + 
EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base( xpr.lhs().template leftCols(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() ) + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) ) diff --git a/externals/eigen/Eigen/src/Geometry/Hyperplane.h b/externals/eigen/Eigen/src/Geometry/Hyperplane.h index 05929b29..cebe0355 100644 --- a/externals/eigen/Eigen/src/Geometry/Hyperplane.h +++ b/externals/eigen/Eigen/src/Geometry/Hyperplane.h @@ -119,7 +119,7 @@ class Hyperplane * If the dimension of the ambient space is greater than 2, then there isn't uniqueness, * so an arbitrary choice is made. */ - // FIXME to be consitent with the rest this could be implemented as a static Through function ?? + // FIXME to be consistent with the rest this could be implemented as a static Through function ?? EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine& parametrized) { normal() = parametrized.direction().unitOrthogonal(); diff --git a/externals/eigen/Eigen/src/Geometry/OrthoMethods.h b/externals/eigen/Eigen/src/Geometry/OrthoMethods.h index a035e631..524aebe1 100644 --- a/externals/eigen/Eigen/src/Geometry/OrthoMethods.h +++ b/externals/eigen/Eigen/src/Geometry/OrthoMethods.h @@ -27,9 +27,10 @@ namespace Eigen { template template #ifndef EIGEN_PARSED_BY_DOXYGEN -EIGEN_DEVICE_FUNC inline typename MatrixBase::template cross_product_return_type::type +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename MatrixBase::template cross_product_return_type::type #else -inline typename MatrixBase::PlainObject +typename MatrixBase::PlainObject #endif MatrixBase::cross(const MatrixBase& other) const { diff --git a/externals/eigen/Eigen/src/Geometry/ParametrizedLine.h b/externals/eigen/Eigen/src/Geometry/ParametrizedLine.h index 1e985d8c..584f5008 100644 --- a/externals/eigen/Eigen/src/Geometry/ParametrizedLine.h +++ b/externals/eigen/Eigen/src/Geometry/ParametrizedLine.h @@ -87,7 +87,7 @@ class ParametrizedLine /** 
\returns the distance of a point \a p to its projection onto the line \c *this. * \sa squaredDistance() */ - EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD_MATH(sqrt) return sqrt(squaredDistance(p)); } + EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const { EIGEN_USING_STD(sqrt) return sqrt(squaredDistance(p)); } /** \returns the projection of a point \a p onto the line \c *this. */ EIGEN_DEVICE_FUNC VectorType projection(const VectorType& p) const @@ -104,7 +104,44 @@ class ParametrizedLine template EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const; - /** \returns \c *this with scalar type casted to \a NewScalarType + /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this. + * + * \param mat the Dim x Dim transformation matrix + * \param traits specifies whether the matrix \a mat represents an #Isometry + * or a more generic #Affine transformation. The default is #Affine. + */ + template + EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase& mat, TransformTraits traits = Affine) + { + if (traits==Affine) + direction() = (mat * direction()).normalized(); + else if (traits==Isometry) + direction() = mat * direction(); + else + { + eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()"); + } + origin() = mat * origin(); + return *this; + } + + /** Applies the transformation \a t to \c *this and returns a reference to \c *this. + * + * \param t the transformation of dimension Dim + * \param traits specifies whether the transformation \a t represents an #Isometry + * or a more generic #Affine transformation. The default is #Affine. + * Other kind of transformations are not supported. 
+ */ + template + EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const Transform& t, + TransformTraits traits = Affine) + { + transform(t.linear(), traits); + origin() += t.translation(); + return *this; + } + +/** \returns \c *this with scalar type casted to \a NewScalarType * * Note that if \a NewScalarType is equal to the current scalar type of \c *this * then this function smartly returns a const reference to \c *this. diff --git a/externals/eigen/Eigen/src/Geometry/Quaternion.h b/externals/eigen/Eigen/src/Geometry/Quaternion.h index f6ef1bcf..3259e592 100644 --- a/externals/eigen/Eigen/src/Geometry/Quaternion.h +++ b/externals/eigen/Eigen/src/Geometry/Quaternion.h @@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; + typedef typename Coefficients::CoeffReturnType CoeffReturnType; + typedef typename internal::conditional::Flags&LvalueBit), + Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + + enum { Flags = Eigen::internal::traits::Flags }; @@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar w() const { return 
this->derived().coeffs().coeff(3); } + EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } - /** \returns a reference to the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); } - /** \returns a reference to the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); } - /** \returns a reference to the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); } - /** \returns a reference to the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); } + /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } + /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } + /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } + /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } /** \returns a read-only vector expression of the imaginary part (x,y,z) */ EIGEN_DEVICE_FUNC inline const VectorBlock vec() const { return coeffs().template head<3>(); } @@ -136,7 +141,7 @@ class QuaternionBase : public RotationBase template EIGEN_DEVICE_FUNC Scalar angularDistance(const QuaternionBase& other) const; /** \returns an equivalent 3x3 rotation matrix */ - EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix() const; + EIGEN_DEVICE_FUNC inline Matrix3 toRotationMatrix() const; /** \returns the quaternion which transform \a a into \a b through a 
rotation */ template @@ -153,6 +158,22 @@ class QuaternionBase : public RotationBase template EIGEN_DEVICE_FUNC Quaternion slerp(const Scalar& t, const QuaternionBase& other) const; + /** \returns true if each coefficients of \c *this and \a other are all exactly equal. + * \warning When using floating point scalar values you probably should rather use a + * fuzzy comparison such as isApprox() + * \sa isApprox(), operator!= */ + template + EIGEN_DEVICE_FUNC inline bool operator==(const QuaternionBase& other) const + { return coeffs() == other.coeffs(); } + + /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. + * \warning When using floating point scalar values you probably should rather use a + * fuzzy comparison such as isApprox() + * \sa isApprox(), operator== */ + template + EIGEN_DEVICE_FUNC inline bool operator!=(const QuaternionBase& other) const + { return coeffs() != other.coeffs(); } + /** \returns \c true if \c *this is approximately equal to \a other, within the precision * determined by \a prec. * @@ -164,20 +185,45 @@ class QuaternionBase : public RotationBase /** return the result vector of \a v through the rotation*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const; + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns \c *this with scalar type casted to \a NewScalarType * * Note that if \a NewScalarType is equal to the current scalar type of \c *this * then this function smartly returns a const reference to \c *this. 
*/ template - EIGEN_DEVICE_FUNC inline typename internal::cast_return_type >::type cast() const + EIGEN_DEVICE_FUNC inline typename internal::cast_return_type >::type cast() const; + + #else + + template + EIGEN_DEVICE_FUNC inline + typename internal::enable_if::value,const Derived&>::type cast() const { - return typename internal::cast_return_type >::type(derived()); + return derived(); } + template + EIGEN_DEVICE_FUNC inline + typename internal::enable_if::value,Quaternion >::type cast() const + { + return Quaternion(coeffs().template cast()); + } + #endif + +#ifndef EIGEN_NO_IO + friend std::ostream& operator<<(std::ostream& s, const QuaternionBase& q) { + s << q.x() << "i + " << q.y() << "j + " << q.z() << "k" << " + " << q.w(); + return s; + } +#endif + #ifdef EIGEN_QUATERNIONBASE_PLUGIN # include EIGEN_QUATERNIONBASE_PLUGIN #endif +protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase) }; /*************************************************************************** @@ -271,6 +317,21 @@ class Quaternion : public QuaternionBase > EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion& other) { m_coeffs = other.coeffs().template cast(); } +#if EIGEN_HAS_RVALUE_REFERENCES + // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. 
+ /** Default move constructor */ + EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) + : m_coeffs(std::move(other.coeffs())) + {} + + /** Default move assignment operator */ + EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) + { + m_coeffs = std::move(other.coeffs()); + return *this; + } +#endif + EIGEN_DEVICE_FUNC static Quaternion UnitRandom(); template @@ -423,7 +484,7 @@ typedef Map, Aligned> QuaternionMapAlignedd; // Generic Quaternion * Quaternion product // This product can be specialized for a given architecture via the Arch template argument. namespace internal { -template struct quat_product +template struct quat_product { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& a, const QuaternionBase& b){ return Quaternion @@ -446,8 +507,7 @@ QuaternionBase::operator* (const QuaternionBase& other) c EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) return internal::quat_product::Scalar, - EIGEN_PLAIN_ENUM_MIN(internal::traits::Alignment, internal::traits::Alignment)>::run(*this, other); + typename internal::traits::Scalar>::run(*this, other); } /** \sa operator*(Quaternion) */ @@ -500,8 +560,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase::operator template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase::operator=(const AngleAxisType& aa) { - EIGEN_USING_STD_MATH(cos) - EIGEN_USING_STD_MATH(sin) + EIGEN_USING_STD(cos) + EIGEN_USING_STD(sin) Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings this->w() = cos(ha); this->vec() = sin(ha) * aa.axis(); @@ -577,7 +637,7 @@ template template EIGEN_DEVICE_FUNC inline Derived& QuaternionBase::setFromTwoVectors(const MatrixBase& a, const MatrixBase& b) { - 
EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD(sqrt) Vector3 v0 = a.normalized(); Vector3 v1 = b.normalized(); Scalar c = v1.dot(v0); @@ -618,13 +678,13 @@ EIGEN_DEVICE_FUNC inline Derived& QuaternionBase::setFromTwoVectors(con template EIGEN_DEVICE_FUNC Quaternion Quaternion::UnitRandom() { - EIGEN_USING_STD_MATH(sqrt) - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(sqrt) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) const Scalar u1 = internal::random(0, 1), u2 = internal::random(0, 2*EIGEN_PI), u3 = internal::random(0, 2*EIGEN_PI); - const Scalar a = sqrt(1 - u1), + const Scalar a = sqrt(Scalar(1) - u1), b = sqrt(u1); return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3)); } @@ -672,7 +732,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> // Generic conjugate of a Quaternion namespace internal { -template struct quat_conj +template struct quat_conj { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& q){ return Quaternion(q.w(),-q.x(),-q.y(),-q.z()); @@ -691,8 +751,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> QuaternionBase::conjugate() const { return internal::quat_conj::Scalar, - internal::traits::Alignment>::run(*this); + typename internal::traits::Scalar>::run(*this); } @@ -704,7 +763,7 @@ template EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar QuaternionBase::angularDistance(const QuaternionBase& other) const { - EIGEN_USING_STD_MATH(atan2) + EIGEN_USING_STD(atan2) Quaternion d = (*this) * other.conjugate(); return Scalar(2) * atan2( d.vec().norm(), numext::abs(d.w()) ); } @@ -722,8 +781,8 @@ template EIGEN_DEVICE_FUNC Quaternion::Scalar> QuaternionBase::slerp(const Scalar& t, const QuaternionBase& other) const { - EIGEN_USING_STD_MATH(acos) - EIGEN_USING_STD_MATH(sin) + EIGEN_USING_STD(acos) + EIGEN_USING_STD(sin) const Scalar one = Scalar(1) - NumTraits::epsilon(); Scalar d = this->dot(other); Scalar absD = numext::abs(d); @@ -760,7 +819,7 @@ struct 
quaternionbase_assign_impl template EIGEN_DEVICE_FUNC static inline void run(QuaternionBase& q, const Other& a_mat) { const typename internal::nested_eval::type mat(a_mat); - EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD(sqrt) // This algorithm comes from "Quaternion Calculus and Fast Animation", // Ken Shoemake, 1987 SIGGRAPH course notes Scalar t = mat.trace(); diff --git a/externals/eigen/Eigen/src/Geometry/Rotation2D.h b/externals/eigen/Eigen/src/Geometry/Rotation2D.h index 884b7d0e..d0bd5756 100644 --- a/externals/eigen/Eigen/src/Geometry/Rotation2D.h +++ b/externals/eigen/Eigen/src/Geometry/Rotation2D.h @@ -175,7 +175,7 @@ template template EIGEN_DEVICE_FUNC Rotation2D& Rotation2D::fromRotationMatrix(const MatrixBase& mat) { - EIGEN_USING_STD_MATH(atan2) + EIGEN_USING_STD(atan2) EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE) m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0)); return *this; @@ -187,8 +187,8 @@ template typename Rotation2D::Matrix2 EIGEN_DEVICE_FUNC Rotation2D::toRotationMatrix(void) const { - EIGEN_USING_STD_MATH(sin) - EIGEN_USING_STD_MATH(cos) + EIGEN_USING_STD(sin) + EIGEN_USING_STD(cos) Scalar sinA = sin(m_angle); Scalar cosA = cos(m_angle); return (Matrix2() << cosA, -sinA, sinA, cosA).finished(); diff --git a/externals/eigen/Eigen/src/Geometry/Scaling.h b/externals/eigen/Eigen/src/Geometry/Scaling.h index f58ca03d..d352f1f2 100644 --- a/externals/eigen/Eigen/src/Geometry/Scaling.h +++ b/externals/eigen/Eigen/src/Geometry/Scaling.h @@ -14,7 +14,7 @@ namespace Eigen { /** \geometry_module \ingroup Geometry_Module * - * \class Scaling + * \class UniformScaling * * \brief Represents a generic uniform scaling transformation * @@ -29,6 +29,22 @@ namespace Eigen { * * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform */ + +namespace internal +{ + // This helper helps nvcc+MSVC to properly parse this file. + // See bug 1412. 
+ template + struct uniformscaling_times_affine_returntype + { + enum + { + NewMode = int(Mode) == int(Isometry) ? Affine : Mode + }; + typedef Transform type; + }; +} + template class UniformScaling { @@ -60,9 +76,11 @@ class UniformScaling /** Concatenates a uniform scaling and an affine transformation */ template - inline Transform operator* (const Transform& t) const + inline typename + internal::uniformscaling_times_affine_returntype::type + operator* (const Transform& t) const { - Transform res = t; + typename internal::uniformscaling_times_affine_returntype::type res = t; res.prescale(factor()); return res; } @@ -70,7 +88,7 @@ class UniformScaling /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression template - inline typename internal::plain_matrix_type::type operator* (const MatrixBase& other) const + inline typename Eigen::internal::plain_matrix_type::type operator* (const MatrixBase& other) const { return other * m_factor; } template @@ -110,7 +128,7 @@ class UniformScaling /** Concatenates a linear transformation matrix and a uniform scaling * \relates UniformScaling */ -// NOTE this operator is defiend in MatrixBase and not as a friend function +// NOTE this operator is defined in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC template EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product) diff --git a/externals/eigen/Eigen/src/Geometry/Transform.h b/externals/eigen/Eigen/src/Geometry/Transform.h index 3f31ee45..52b8c2a4 100644 --- a/externals/eigen/Eigen/src/Geometry/Transform.h +++ b/externals/eigen/Eigen/src/Geometry/Transform.h @@ -12,7 +12,7 @@ #ifndef EIGEN_TRANSFORM_H #define EIGEN_TRANSFORM_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -47,7 +47,7 @@ struct transform_left_product_impl; template< typename Lhs, typename Rhs, - bool AnyProjective = + bool AnyProjective = transform_traits::IsProjective || 
transform_traits::IsProjective> struct transform_transform_product_impl; @@ -97,6 +97,9 @@ template struct transform_make_affine; * - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix. * - #Projective: the transformation is stored as a (Dim+1)^2 matrix * without any assumption. + * - #Isometry: same as #Affine with the additional assumption that + * the linear part represents a rotation. This assumption is exploited + * to speed up some functions such as inverse() and rotation(). * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. * These Options are passed directly to the underlying matrix type. * @@ -115,7 +118,7 @@ template struct transform_make_affine; * \end{array} \right) \f$ * * Note that for a projective transformation the last row can be anything, - * and then the interpretation of different parts might be sightly different. + * and then the interpretation of different parts might be slightly different. * * However, unlike a plain matrix, the Transform class provides many features * simplifying both its assembly and usage. 
In particular, it can be composed @@ -220,9 +223,9 @@ class Transform /** type of the matrix used to represent the linear part of the transformation */ typedef Matrix LinearMatrixType; /** type of read/write reference to the linear part of the transformation */ - typedef Block LinearPart; + typedef Block LinearPart; /** type of read reference to the linear part of the transformation */ - typedef const Block ConstLinearPart; + typedef const Block ConstLinearPart; /** type of read/write reference to the affine part of the transformation */ typedef typename internal::conditional::Flags & RowMajorBit)> ConstTranslationPart; /** corresponding translation type */ typedef Translation TranslationType; - + // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0 enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) }; /** The return type of the product between a diagonal matrix and a transform */ @@ -252,17 +255,11 @@ class Transform public: /** Default constructor without initialization of the meaningful coefficients. - * If Mode==Affine, then the last row is set to [0 ... 0 1] */ + * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */ EIGEN_DEVICE_FUNC inline Transform() { check_template_params(); - internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix); - } - - EIGEN_DEVICE_FUNC inline Transform(const Transform& other) - { - check_template_params(); - m_matrix = other.m_matrix; + internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? 
Affine : AffineCompact>::run(m_matrix); } EIGEN_DEVICE_FUNC inline explicit Transform(const TranslationType& t) @@ -282,9 +279,6 @@ class Transform *this = r; } - EIGEN_DEVICE_FUNC inline Transform& operator=(const Transform& other) - { m_matrix = other.m_matrix; return *this; } - typedef internal::transform_take_affine_part take_affine_part; /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */ @@ -308,7 +302,7 @@ class Transform internal::transform_construct_from_matrix::run(this, other.derived()); return *this; } - + template EIGEN_DEVICE_FUNC inline Transform(const Transform& other) { @@ -335,7 +329,7 @@ class Transform OtherModeIsAffineCompact = OtherMode == int(AffineCompact) }; - if(ModeIsAffineCompact == OtherModeIsAffineCompact) + if(EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact)) { // We need the block expression because the code is compiled for all // combinations of transformations and will trigger a compile time error @@ -343,7 +337,7 @@ class Transform m_matrix.template block(0,0) = other.matrix().template block(0,0); makeAffine(); } - else if(OtherModeIsAffineCompact) + else if(EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact)) { typedef typename Transform::MatrixType OtherMatrixType; internal::transform_construct_from_matrix::run(this, other.matrix()); @@ -380,9 +374,9 @@ class Transform inline Transform& operator=(const QTransform& other); inline QTransform toQTransform(void) const; #endif - - EIGEN_DEVICE_FUNC Index rows() const { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); } - EIGEN_DEVICE_FUNC Index cols() const { return m_matrix.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return int(Mode)==int(Projective) ? 
m_matrix.cols() : (m_matrix.cols()-1); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_matrix.cols(); } /** shortcut for m_matrix(row,col); * \sa MatrixBase::operator(Index,Index) const */ @@ -456,7 +450,7 @@ class Transform /** \returns The product expression of a transform \a a times a diagonal matrix \a b * * The rhs diagonal matrix is interpreted as an affine scaling transformation. The - * product results in a Transform of the same type (mode) as the lhs only if the lhs + * product results in a Transform of the same type (mode) as the lhs only if the lhs * mode is no isometry. In that case, the returned transform is an affinity. */ template @@ -471,7 +465,7 @@ class Transform /** \returns The product expression of a diagonal matrix \a a times a transform \a b * * The lhs diagonal matrix is interpreted as an affine scaling transformation. The - * product results in a Transform of the same type (mode) as the lhs only if the lhs + * product results in a Transform of the same type (mode) as the lhs only if the lhs * mode is no isometry. In that case, the returned transform is an affinity. 
*/ template @@ -481,7 +475,7 @@ class Transform TransformTimeDiagonalReturnType res; res.linear().noalias() = a*b.linear(); res.translation().noalias() = a*b.translation(); - if (Mode!=int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode!=int(AffineCompact))) res.matrix().row(Dim) = b.matrix().row(Dim); return res; } @@ -494,7 +488,7 @@ class Transform { return internal::transform_transform_product_impl::run(*this,other); } - + #if EIGEN_COMP_ICC private: // this intermediate structure permits to workaround a bug in ICC 11: @@ -503,13 +497,13 @@ class Transform // (the meaning of a name may have changed since the template declaration -- the type of the template is: // "Eigen::internal::transform_transform_product_impl, // Eigen::Transform, >::ResultType (const Eigen::Transform &) const") - // + // template struct icc_11_workaround { typedef internal::transform_transform_product_impl > ProductType; typedef typename ProductType::ResultType ResultType; }; - + public: /** Concatenates two different transformations */ template @@ -542,7 +536,7 @@ class Transform } template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC inline Transform& scale(const MatrixBase &other); template @@ -572,18 +566,18 @@ class Transform EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy); EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t); - + EIGEN_DEVICE_FUNC inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); } - + EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC inline Transform& operator=(const UniformScaling& t); - + EIGEN_DEVICE_FUNC inline Transform& operator*=(const UniformScaling& s) { return scale(s.factor()); } - + EIGEN_DEVICE_FUNC inline TransformTimeDiagonalReturnType operator*(const UniformScaling& s) const { @@ -602,7 +596,9 @@ class Transform template EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase& r) const; - 
EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const; + typedef typename internal::conditional::type RotationReturnType; + EIGEN_DEVICE_FUNC RotationReturnType rotation() const; + template EIGEN_DEVICE_FUNC void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const; @@ -684,7 +680,7 @@ class Transform #ifdef EIGEN_TRANSFORM_PLUGIN #include EIGEN_TRANSFORM_PLUGIN #endif - + protected: #ifndef EIGEN_PARSED_BY_DOXYGEN EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params() @@ -755,7 +751,7 @@ template Transform& Transform::operator=(const QMatrix& other) { EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (Mode == int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(); else @@ -801,7 +797,7 @@ Transform& Transform::operator { check_template_params(); EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (Mode == int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(); else @@ -819,7 +815,7 @@ template QTransform Transform::toQTransform(void) const { EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (Mode == int(AffineCompact)) + if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(0,2), m_matrix.coeff(1,2)); @@ -912,7 +908,7 @@ EIGEN_DEVICE_FUNC Transform& Transform::pretranslate(const MatrixBase &other) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim)) - if(int(Mode)==int(Projective)) + if(EIGEN_CONST_CONDITIONAL(int(Mode)==int(Projective))) affine() += other * m_matrix.row(Dim); else translation() += other; @@ -1046,20 +1042,43 @@ EIGEN_DEVICE_FUNC inline Transform Transform struct transform_rotation_impl { + template + 
EIGEN_DEVICE_FUNC static inline + const typename TransformType::LinearMatrixType run(const TransformType& t) + { + typedef typename TransformType::LinearMatrixType LinearMatrixType; + LinearMatrixType result; + t.computeRotationScaling(&result, (LinearMatrixType*)0); + return result; + } +}; +template<> struct transform_rotation_impl { + template + EIGEN_DEVICE_FUNC static inline + typename TransformType::ConstLinearPart run(const TransformType& t) + { + return t.linear(); + } +}; +} /** \returns the rotation part of the transformation * + * If Mode==Isometry, then this method is an alias for linear(), + * otherwise it calls computeRotationScaling() to extract the rotation + * through a SVD decomposition. * * \svd_module * * \sa computeRotationScaling(), computeScalingRotation(), class SVD */ template -EIGEN_DEVICE_FUNC const typename Transform::LinearMatrixType +EIGEN_DEVICE_FUNC +typename Transform::RotationReturnType Transform::rotation() const { - LinearMatrixType result; - computeRotationScaling(&result, (LinearMatrixType*)0); - return result; + return internal::transform_rotation_impl::run(*this); } @@ -1078,17 +1097,18 @@ template template EIGEN_DEVICE_FUNC void Transform::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const { + // Note that JacobiSVD is faster than BDCSVD for small matrices. JacobiSVD svd(linear(), ComputeFullU | ComputeFullV); - Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1 + Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? 
Scalar(-1) : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); - sv.coeffRef(0) *= x; - if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint()); + sv.coeffRef(Dim-1) *= x; + if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint(); if(rotation) { LinearMatrixType m(svd.matrixU()); - m.col(0) /= x; - rotation->lazyAssign(m * svd.matrixV().adjoint()); + m.col(Dim-1) *= x; + *rotation = m * svd.matrixV().adjoint(); } } @@ -1107,17 +1127,18 @@ template template EIGEN_DEVICE_FUNC void Transform::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const { + // Note that JacobiSVD is faster than BDCSVD for small matrices. JacobiSVD svd(linear(), ComputeFullU | ComputeFullV); - Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1 + Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0) ? Scalar(-1) : Scalar(1); // so x has absolute value 1 VectorType sv(svd.singularValues()); - sv.coeffRef(0) *= x; - if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint()); + sv.coeffRef(Dim-1) *= x; + if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint(); if(rotation) { LinearMatrixType m(svd.matrixU()); - m.col(0) /= x; - rotation->lazyAssign(m * svd.matrixV().adjoint()); + m.col(Dim-1) *= x; + *rotation = m * svd.matrixV().adjoint(); } } @@ -1156,7 +1177,7 @@ struct transform_make_affine { template EIGEN_DEVICE_FUNC static void run(MatrixType &) { } }; - + // selector needed to avoid taking the inverse of a 3x4 matrix template struct projective_transform_inverse @@ -1297,8 +1318,8 @@ struct transform_construct_from_matrix struct transform_product_result { - enum - { + enum + { Mode = (LhsMode == (int)Projective || RhsMode == (int)Projective ) ? Projective : (LhsMode == (int)Affine || RhsMode == (int)Affine ) ? 
Affine : @@ -1312,7 +1333,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols> { typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { return T.matrix() * other; } @@ -1321,8 +1342,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 0, RhsCols> template< typename TransformType, typename MatrixType, int RhsCols> struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> { - enum { - Dim = TransformType::Dim, + enum { + Dim = TransformType::Dim, HDim = TransformType::HDim, OtherRows = MatrixType::RowsAtCompileTime, OtherCols = MatrixType::ColsAtCompileTime @@ -1330,7 +1351,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); @@ -1339,7 +1360,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> ResultType res(other.rows(),other.cols()); TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other; res.row(OtherRows-1) = other.row(OtherRows-1); - + return res; } }; @@ -1347,8 +1368,8 @@ struct transform_right_product_impl< TransformType, MatrixType, 1, RhsCols> template< typename TransformType, typename MatrixType, int RhsCols> struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols> { - enum { - Dim = TransformType::Dim, + enum { + Dim = TransformType::Dim, HDim = TransformType::HDim, OtherRows = MatrixType::RowsAtCompileTime, OtherCols = MatrixType::ColsAtCompileTime @@ -1356,7 +1377,7 @@ 
struct transform_right_product_impl< TransformType, MatrixType, 2, RhsCols> typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); @@ -1381,7 +1402,7 @@ struct transform_right_product_impl< TransformType, MatrixType, 2, 1> // rhs is typedef typename MatrixType::PlainObject ResultType; - static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) { EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES); diff --git a/externals/eigen/Eigen/src/Geometry/Translation.h b/externals/eigen/Eigen/src/Geometry/Translation.h index 51d9a82e..8c229012 100644 --- a/externals/eigen/Eigen/src/Geometry/Translation.h +++ b/externals/eigen/Eigen/src/Geometry/Translation.h @@ -70,18 +70,18 @@ class Translation /** Constructs and initialize the translation transformation from a vector of translation coefficients */ EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {} - /** \brief Retruns the x-translation by value. **/ + /** \brief Returns the x-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); } - /** \brief Retruns the y-translation by value. **/ + /** \brief Returns the y-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); } - /** \brief Retruns the z-translation by value. **/ + /** \brief Returns the z-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); } - /** \brief Retruns the x-translation as a reference. **/ + /** \brief Returns the x-translation as a reference. 
**/ EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); } - /** \brief Retruns the y-translation as a reference. **/ + /** \brief Returns the y-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); } - /** \brief Retruns the z-translation as a reference. **/ + /** \brief Returns the z-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); } EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; } @@ -138,12 +138,6 @@ class Translation /** \returns the inverse translation (opposite) */ Translation inverse() const { return Translation(-m_coeffs); } - Translation& operator=(const Translation& other) - { - m_coeffs = other.m_coeffs; - return *this; - } - static const Translation Identity() { return Translation(VectorType::Zero()); } /** \returns \c *this with scalar type casted to \a NewScalarType diff --git a/externals/eigen/Eigen/src/Geometry/Umeyama.h b/externals/eigen/Eigen/src/Geometry/Umeyama.h index 7e933fca..6b755008 100644 --- a/externals/eigen/Eigen/src/Geometry/Umeyama.h +++ b/externals/eigen/Eigen/src/Geometry/Umeyama.h @@ -87,7 +87,7 @@ struct umeyama_transform_matrix_type * \f{align*} * T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix} * \f} -* minimizing the resudiual above. This transformation is always returned as an +* minimizing the residual above. This transformation is always returned as an * Eigen::Matrix. */ template diff --git a/externals/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h b/externals/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h new file mode 100644 index 00000000..9af6a9af --- /dev/null +++ b/externals/eigen/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -0,0 +1,168 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2009 Rohit Garg +// Copyright (C) 2009-2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GEOMETRY_SIMD_H +#define EIGEN_GEOMETRY_SIMD_H + +namespace Eigen { + +namespace internal { + +template +struct quat_product +{ + enum { + AAlignment = traits::Alignment, + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) + { + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + Quaternion res; + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {0.f, 0.f, 0.f, neg_zero}; + const Packet4f mask = ploadu(arr); + Packet4f a = ae.template packet(0); + Packet4f b = be.template packet(0); + Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); + pstoret( + &res.x(), + padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), + pmul(vec4f_swizzle1(a,2,0,1,0), + vec4f_swizzle1(b,1,2,0,0))), + pxor(mask,padd(s1,s2)))); + + return res; + } +}; + +template +struct quat_conj +{ + enum { + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& q) + { + evaluator qe(q.coeffs()); + Quaternion res; + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f}; + const Packet4f mask = ploadu(arr); + pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); + return res; + } +}; + + +template +struct cross3_impl +{ + enum { + ResAlignment = traits::type>::Alignment + }; + static inline typename plain_matrix_type::type + run(const VectorLhs& lhs, const VectorRhs& rhs) + { + evaluator lhs_eval(lhs); + evaluator rhs_eval(rhs); + Packet4f a = lhs_eval.template 
packet::Alignment,Packet4f>(0); + Packet4f b = rhs_eval.template packet::Alignment,Packet4f>(0); + Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); + Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); + typename plain_matrix_type::type res; + pstoret(&res.x(),psub(mul1,mul2)); + return res; + } +}; + + + +#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64) + +template +struct quat_product +{ + enum { + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) + { + Quaternion res; + + evaluator ae(_a.coeffs()); + evaluator be(_b.coeffs()); + + const double* a = _a.coeffs().data(); + Packet2d b_xy = be.template packet(0); + Packet2d b_zw = be.template packet(2); + Packet2d a_xx = pset1(a[0]); + Packet2d a_yy = pset1(a[1]); + Packet2d a_zz = pset1(a[2]); + Packet2d a_ww = pset1(a[3]); + + // two temporaries: + Packet2d t1, t2; + + /* + * t1 = ww*xy + yy*zw + * t2 = zz*xy - xx*zw + * res.xy = t1 +/- swap(t2) + */ + t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); + t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); + pstoret(&res.x(), paddsub(t1, preverse(t2))); + + /* + * t1 = ww*zw - yy*xy + * t2 = zz*zw + xx*xy + * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) + */ + t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); + t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); + pstoret(&res.z(), preverse(paddsub(preverse(t1), t2))); + + return res; +} +}; + +template +struct quat_conj +{ + enum { + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& q) + { + evaluator qe(q.coeffs()); + Quaternion res; + const double neg_zero = numext::bit_cast(0x8000000000000000ull); + const double arr1[2] = {neg_zero, neg_zero}; + const double arr2[2] = {neg_zero, 0.0}; + const Packet2d mask0 = ploadu(arr1); + const Packet2d mask2 = ploadu(arr2); + pstoret(&res.x(), pxor(mask0, qe.template 
packet::Alignment,Packet2d>(0))); + pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); + return res; + } +}; + +#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64 + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GEOMETRY_SIMD_H diff --git a/externals/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h b/externals/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h deleted file mode 100644 index 1a86ff83..00000000 --- a/externals/eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +++ /dev/null @@ -1,141 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2009 Rohit Garg -// Copyright (C) 2009-2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_GEOMETRY_SSE_H -#define EIGEN_GEOMETRY_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct quat_product -{ - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - Quaternion res; - const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet(0); - __m128 b = _b.coeffs().template packet(0); - __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstore(&res.x(), - _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), - _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), - vec4f_swizzle1(b,1,2,0,0))), - _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); - - return res; - } -}; - -template -struct quat_conj -{ - static inline Quaternion run(const QuaternionBase& q) - { - Quaternion res; - const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet(0))); - return res; - } -}; - - -template -struct cross3_impl -{ - 
static inline typename plain_matrix_type::type - run(const VectorLhs& lhs, const VectorRhs& rhs) - { - __m128 a = lhs.template packet::Alignment>(0); - __m128 b = rhs.template packet::Alignment>(0); - __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); - __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); - typename plain_matrix_type::type res; - pstore(&res.x(),_mm_sub_ps(mul1,mul2)); - return res; - } -}; - - - - -template -struct quat_product -{ - static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) - { - const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - - Quaternion res; - - const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); - Packet2d a_xx = pset1(a[0]); - Packet2d a_yy = pset1(a[1]); - Packet2d a_zz = pset1(a[2]); - Packet2d a_ww = pset1(a[3]); - - // two temporaries: - Packet2d t1, t2; - - /* - * t1 = ww*xy + yy*zw - * t2 = zz*xy - xx*zw - * res.xy = t1 +/- swap(t2) - */ - t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); - t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2))); -#else - pstore(&res.x(), padd(t1, pxor(mask,preverse(t2)))); -#endif - - /* - * t1 = ww*zw - yy*xy - * t2 = zz*zw + xx*xy - * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) - */ - t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); - t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); -#else - pstore(&res.z(), psub(t1, pxor(mask,preverse(t2)))); -#endif - - return res; -} -}; - -template -struct quat_conj -{ - static inline Quaternion run(const QuaternionBase& q) - { - Quaternion res; - const __m128d mask0 = _mm_setr_pd(-0.,-0.); - const __m128d mask2 = _mm_setr_pd(-0.,0.); - 
pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet(0))); - pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet(2))); - return res; - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_GEOMETRY_SSE_H diff --git a/externals/eigen/Eigen/src/Householder/BlockHouseholder.h b/externals/eigen/Eigen/src/Householder/BlockHouseholder.h index 01a7ed18..39ce1c2a 100644 --- a/externals/eigen/Eigen/src/Householder/BlockHouseholder.h +++ b/externals/eigen/Eigen/src/Householder/BlockHouseholder.h @@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView(); - // FIXME add .noalias() once the triangular product can work inplace - triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + // FIXME use the following line with .noalias() once the triangular product can work inplace + // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + for(Index j=nbVecs-1; j>i; --j) + { + typename TriangularFactorType::Scalar z = triFactor(i,j); + triFactor(i,j) = z * triFactor(j,j); + if(nbVecs-j-1>0) + triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1); + } } triFactor(i,i) = hCoeffs(i); diff --git a/externals/eigen/Eigen/src/Householder/Householder.h b/externals/eigen/Eigen/src/Householder/Householder.h index 80de2c30..5bc037f0 100644 --- a/externals/eigen/Eigen/src/Householder/Householder.h +++ b/externals/eigen/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 
+63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -103,13 +105,14 @@ void MatrixBase::makeHouseholder( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->cols() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheRight() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -140,13 +143,14 @@ void MatrixBase::applyHouseholderOnTheLeft( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->rows() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheLeft() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, @@ -160,10 +164,10 @@ void MatrixBase::applyHouseholderOnTheRight( { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); - tmp.noalias() = right * essential.conjugate(); + tmp.noalias() = right * essential; tmp += this->col(0); this->col(0) -= tau * tmp; - right.noalias() -= tau * tmp * essential.transpose(); + right.noalias() -= tau * tmp * essential.adjoint(); } } diff --git a/externals/eigen/Eigen/src/Householder/HouseholderSequence.h b/externals/eigen/Eigen/src/Householder/HouseholderSequence.h index 3ce0a693..022f6c3d 100644 --- a/externals/eigen/Eigen/src/Householder/HouseholderSequence.h +++ 
b/externals/eigen/Eigen/src/Householder/HouseholderSequence.h @@ -11,7 +11,7 @@ #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H #define EIGEN_HOUSEHOLDER_SEQUENCE_H -namespace Eigen { +namespace Eigen { /** \ingroup Householder_Module * \householder_module @@ -34,8 +34,8 @@ namespace Eigen { * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$ * v_i \f$ is a vector of the form - * \f[ - * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. + * \f[ + * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. * \f] * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector. * @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -120,7 +120,7 @@ template class HouseholderS : public EigenBase > { typedef typename internal::hseq_side_dependent_impl::EssentialVectorType EssentialVectorType; - + public: enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, @@ -140,6 +140,28 @@ template class HouseholderS Side > ConjugateReturnType; + typedef HouseholderSequence< + VectorsType, + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + CoeffsType>::type, + Side + > AdjointReturnType; + + typedef HouseholderSequence< + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + VectorsType>::type, + CoeffsType, + Side + 
> TransposeReturnType; + + typedef HouseholderSequence< + typename internal::add_const::type, + typename internal::add_const::type, + Side + > ConstHouseholderSequence; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -157,33 +179,37 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) - : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()), + : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) { } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), - m_trans(other.m_trans), + m_reverse(other.m_reverse), m_length(other.m_length), m_shift(other.m_shift) { } /** \brief Number of rows of transformation viewed as a matrix. - * \returns Number of rows + * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ - Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ - Index cols() const { return rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return rows(); } /** \brief Essential part of a Householder vector. * \param[in] k Index of Householder reflection @@ -191,14 +217,15 @@ template class HouseholderS * * This function returns the essential part of the Householder vector \f$ v_i \f$. 
This is a vector of * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector - * \f[ - * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. + * \f[ + * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. * \f] * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v * passed to the constructor. * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -206,31 +233,51 @@ template class HouseholderS } /** \brief %Transpose of the Householder sequence. */ - HouseholderSequence transpose() const + TransposeReturnType transpose() const { - return HouseholderSequence(*this).setTrans(!m_trans); + return TransposeReturnType(m_vectors.conjugate(), m_coeffs) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Complex conjugate of the Householder sequence. */ ConjugateReturnType conjugate() const { return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate()) - .setTrans(m_trans) + .setReverseFlag(m_reverse) .setLength(m_length) .setShift(m_shift); } + /** \returns an expression of the complex conjugate of \c *this if Cond==true, + * returns \c *this otherwise. + */ + template + EIGEN_DEVICE_FUNC + inline typename internal::conditional::type + conjugateIf() const + { + typedef typename internal::conditional::type ReturnType; + return ReturnType(m_vectors.template conjugateIf(), m_coeffs.template conjugateIf()); + } + /** \brief Adjoint (conjugate transpose) of the Householder sequence. 
*/ - ConjugateReturnType adjoint() const + AdjointReturnType adjoint() const { - return conjugate().setTrans(!m_trans); + return AdjointReturnType(m_vectors, m_coeffs.conjugate()) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Inverse of the Householder sequence (equals the adjoint). */ - ConjugateReturnType inverse() const { return adjoint(); } + AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -239,6 +286,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -251,7 +299,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -265,18 +313,26 @@ template class HouseholderS for(Index k = 0; kBlockSize) + { + dst.setIdentity(rows(), rows()); + if(m_reverse) + applyThisOnTheLeft(dst,workspace,true); + else + applyThisOnTheLeft(dst,workspace,true); + } else { dst.setIdentity(rows(), rows()); for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data()); } } } @@ -295,42 +351,52 @@ template class HouseholderS 
workspace.resize(dst.rows()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? m_length-k-1 : k; + Index actual_k = m_reverse ? m_length-k-1 : k; dst.rightCols(rows()-m_shift-actual_k) .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } /** \internal */ - template inline void applyThisOnTheLeft(Dest& dst) const + template inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const { Matrix workspace; - applyThisOnTheLeft(dst, workspace); + applyThisOnTheLeft(dst, workspace, inputIsIdentity); } /** \internal */ template - inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const + inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const { - const Index BlockSize = 48; + if(inputIsIdentity && m_reverse) + inputIsIdentity = false; // if the entries are large enough, then apply the reflectors by block if(m_length>=BlockSize && dst.cols()>1) { - for(Index i = 0; i < m_length; i+=BlockSize) + // Make sure we have at least 2 useful blocks, otherwise it is point-less: + Index blockSize = m_length::type,Dynamic,Dynamic> SubVectorsType; SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start, Side==OnTheRight ? start : k, Side==OnTheRight ? bs : m_vectors.rows()-start, Side==OnTheRight ? m_vectors.cols()-start : bs); typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); - Block sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans); + + Index dstStart = dst.rows()-rows()+m_shift+k; + Index dstRows = rows()-m_shift-k; + Block sub_dst(dst, + dstStart, + inputIsIdentity ? dstStart : 0, + dstRows, + inputIsIdentity ? 
dstRows : dst.cols()); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); } } else @@ -338,8 +404,9 @@ template class HouseholderS workspace.resize(dst.cols()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? k : m_length-k-1; - dst.bottomRows(rows()-m_shift-actual_k) + Index actual_k = m_reverse ? k : m_length-k-1; + Index dstStart = rows()-m_shift-actual_k; + dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols()) .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } @@ -357,7 +424,7 @@ template class HouseholderS { typename internal::matrix_type_times_scalar_type::Type res(other.template cast::ResultScalar>()); - applyThisOnTheLeft(res); + applyThisOnTheLeft(res, internal::is_identity::value && res.rows()==res.cols()); return res; } @@ -372,6 +439,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -389,13 +457,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */ /* Necessary for .adjoint() and .conjugate() */ @@ -403,27 +475,30 @@ template class HouseholderS protected: - /** \brief Sets the transpose flag. - * \param [in] trans New value of the transpose flag. + /** \internal + * \brief Sets the reverse flag. + * \param [in] reverse New value of the reverse flag. * - * By default, the transpose flag is not set. If the transpose flag is set, then this object represents - * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * By default, the reverse flag is not set. 
If the reverse flag is set, then this object represents + * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$. * - * \sa trans() + * \sa reverseFlag(), transpose(), adjoint() */ - HouseholderSequence& setTrans(bool trans) + HouseholderSequence& setReverseFlag(bool reverse) { - m_trans = trans; + m_reverse = reverse; return *this; } - bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */ + bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */ typename VectorsType::Nested m_vectors; typename CoeffsType::Nested m_coeffs; - bool m_trans; + bool m_reverse; Index m_length; Index m_shift; + enum { BlockSize = 48 }; }; /** \brief Computes the product of a matrix with a Householder sequence. @@ -444,7 +519,7 @@ typename internal::matrix_type_times_scalar_type @@ -454,7 +529,7 @@ HouseholderSequence householderSequence(const VectorsTyp } /** \ingroup Householder_Module \householder_module - * \brief Convenience function for constructing a Householder sequence. + * \brief Convenience function for constructing a Householder sequence. * \returns A HouseholderSequence constructed from the specified arguments. * \details This function differs from householderSequence() in that the template argument \p OnTheSide of * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft. 
diff --git a/externals/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 358444af..a117fc15 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/externals/eigen/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -10,7 +10,7 @@ #ifndef EIGEN_BASIC_PRECONDITIONERS_H #define EIGEN_BASIC_PRECONDITIONERS_H -namespace Eigen { +namespace Eigen { /** \ingroup IterativeLinearSolvers_Module * \brief A preconditioner based on the digonal entries @@ -52,15 +52,15 @@ class DiagonalPreconditioner compute(mat); } - Index rows() const { return m_invdiag.size(); } - Index cols() const { return m_invdiag.size(); } - + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_invdiag.size(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_invdiag.size(); } + template DiagonalPreconditioner& analyzePattern(const MatType& ) { return *this; } - + template DiagonalPreconditioner& factorize(const MatType& mat) { @@ -77,7 +77,7 @@ class DiagonalPreconditioner m_isInitialized = true; return *this; } - + template DiagonalPreconditioner& compute(const MatType& mat) { @@ -99,7 +99,7 @@ class DiagonalPreconditioner && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b"); return Solve(*this, b.derived()); } - + ComputationInfo info() { return Success; } protected: @@ -121,7 +121,7 @@ class DiagonalPreconditioner * \implsparsesolverconcept * * The diagonal entries are pre-inverted and stored into a dense vector. 
- * + * * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner */ template @@ -146,30 +146,45 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { return *this; } - + template LeastSquareDiagonalPreconditioner& factorize(const MatType& mat) { // Compute the inverse squared-norm of each column of mat m_invdiag.resize(mat.cols()); - for(Index j=0; j0) - m_invdiag(j) = RealScalar(1)/sum; - else - m_invdiag(j) = RealScalar(1); + m_invdiag.setZero(); + for(Index j=0; jRealScalar(0)) + m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j)); + } + else + { + for(Index j=0; jRealScalar(0)) + m_invdiag(j) = RealScalar(1)/sum; + else + m_invdiag(j) = RealScalar(1); + } } Base::m_isInitialized = true; return *this; } - + template LeastSquareDiagonalPreconditioner& compute(const MatType& mat) { return factorize(mat); } - + ComputationInfo info() { return Success; } protected: @@ -190,19 +205,19 @@ class IdentityPreconditioner template explicit IdentityPreconditioner(const MatrixType& ) {} - + template IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; } - + template IdentityPreconditioner& factorize(const MatrixType& ) { return *this; } template IdentityPreconditioner& compute(const MatrixType& ) { return *this; } - + template inline const Rhs& solve(const Rhs& b) const { return b; } - + ComputationInfo info() { return Success; } }; diff --git a/externals/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 454f4681..153acef6 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/externals/eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -191,32 +191,16 @@ class BiCGSTAB : public IterativeSolverBase - void _solve_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const { - bool failed = false; - for(Index j=0; j - void _solve_impl(const MatrixBase& b, Dest& 
x) const - { - x.resize(this->rows(),b.cols()); - x.setZero(); - _solve_with_guess_impl(b,x); } protected: diff --git a/externals/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 395daa8e..5d8c6b43 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/externals/eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -50,7 +50,8 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = 0; return; } - RealScalar threshold = tol*tol*rhsNorm2; + const RealScalar considerAsZero = (std::numeric_limits::min)(); + RealScalar threshold = numext::maxi(RealScalar(tol*tol*rhsNorm2),considerAsZero); RealScalar residualNorm2 = residual.squaredNorm(); if (residualNorm2 < threshold) { @@ -58,7 +59,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = sqrt(residualNorm2 / rhsNorm2); return; } - + VectorType p(n); p = precond.solve(residual); // initial search direction @@ -194,7 +195,7 @@ class ConjugateGradient : public IterativeSolverBase - void _solve_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const { typedef typename Base::MatrixWrapper MatrixWrapper; typedef typename Base::ActualMatrixType ActualMatrixType; @@ -210,31 +211,14 @@ class ConjugateGradient : public IterativeSolverBase::Type >::type SelfAdjointWrapper; + m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - for(Index j=0; j - void _solve_impl(const MatrixBase& b, Dest& x) const - { - x.setZero(); - _solve_with_guess_impl(b.derived(),x); - } protected: diff --git a/externals/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index e45c272b..7803fd81 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ 
b/externals/eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -14,8 +14,8 @@ #include #include -namespace Eigen { -/** +namespace Eigen { +/** * \brief Modified Incomplete Cholesky with dual threshold * * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with @@ -41,28 +41,22 @@ namespace Eigen { * the info() method, then you can either increase the initial shift, or better use another preconditioning technique. * */ -template -#else -NaturalOrdering -#endif -> +template > class IncompleteCholesky : public SparseSolverBase > { protected: typedef SparseSolverBase > Base; using Base::m_isInitialized; public: - typedef typename NumTraits::Real RealScalar; + typedef typename NumTraits::Real RealScalar; typedef _OrderingType OrderingType; typedef typename OrderingType::PermutationType PermutationType; - typedef typename PermutationType::StorageIndex StorageIndex; + typedef typename PermutationType::StorageIndex StorageIndex; typedef SparseMatrix FactorType; typedef Matrix VectorSx; typedef Matrix VectorRx; typedef Matrix VectorIx; - typedef std::vector > VectorList; + typedef std::vector > VectorList; enum { UpLo = _UpLo }; enum { ColsAtCompileTime = Dynamic, @@ -76,22 +70,22 @@ class IncompleteCholesky : public SparseSolverBase - IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false) + IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) { compute(matrix); } - + /** \returns number of rows of the factored matrix */ - Index rows() const { return m_L.rows(); } - + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_L.rows(); } + /** \returns number of columns of the factored matrix */ - Index cols() const { return m_L.cols(); } - + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_L.cols(); } + /** \brief Reports whether previous computation was successful. 
* @@ -106,19 +100,19 @@ class IncompleteCholesky : public SparseSolverBase void analyzePattern(const MatrixType& mat) { - OrderingType ord; + OrderingType ord; PermutationType pinv; - ord(mat.template selfadjointView(), pinv); + ord(mat.template selfadjointView(), pinv); if(pinv.size()>0) m_perm = pinv.inverse(); else m_perm.resize(0); m_L.resize(mat.rows(), mat.cols()); @@ -126,7 +120,7 @@ class IncompleteCholesky : public SparseSolverBase void factorize(const MatrixType& mat); - + /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat * * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods. @@ -149,7 +143,7 @@ class IncompleteCholesky : public SparseSolverBase void _solve_impl(const Rhs& b, Dest& x) const @@ -176,16 +170,16 @@ class IncompleteCholesky : public SparseSolverBase colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); -}; + inline void updateList(Ref colPtr, Ref rowIdx, Ref vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol); +}; // Based on the following paper: // C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with @@ -196,10 +190,10 @@ template void IncompleteCholesky::factorize(const _MatrixType& mat) { using std::sqrt; - eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); - + eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); + // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. 
Other strategies will be added - + // Apply the fill-reducing permutation computed in analyzePattern() if (m_perm.rows() == mat.rows() ) // To detect the null permutation { @@ -212,8 +206,8 @@ void IncompleteCholesky::factorize(const _MatrixType { m_L.template selfadjointView() = mat.template selfadjointView<_UpLo>(); } - - Index n = m_L.cols(); + + Index n = m_L.cols(); Index nnz = m_L.nonZeros(); Map vals(m_L.valuePtr(), nnz); //values Map rowIdx(m_L.innerIndexPtr(), nnz); //Row indices @@ -225,9 +219,9 @@ void IncompleteCholesky::factorize(const _MatrixType VectorIx col_pattern(n); col_pattern.fill(-1); StorageIndex col_nnz; - - - // Computes the scaling factors + + + // Computes the scaling factors m_scale.resize(n); m_scale.setZero(); for (Index j = 0; j < n; j++) @@ -237,7 +231,7 @@ void IncompleteCholesky::factorize(const _MatrixType if(rowIdx[k]!=j) m_scale(rowIdx[k]) += numext::abs2(vals(k)); } - + m_scale = m_scale.cwiseSqrt().cwiseSqrt(); for (Index j = 0; j < n; ++j) @@ -247,8 +241,8 @@ void IncompleteCholesky::factorize(const _MatrixType m_scale(j) = 1; // TODO disable scaling if not needed, i.e., if it is roughly uniform? 
(this will make solve() faster) - - // Scale and compute the shift for the matrix + + // Scale and compute the shift for the matrix RealScalar mindiag = NumTraits::highest(); for (Index j = 0; j < n; j++) { @@ -259,7 +253,7 @@ void IncompleteCholesky::factorize(const _MatrixType } FactorType L_save = m_L; - + RealScalar shift = 0; if(mindiag <= RealScalar(0.)) shift = m_initialShift - mindiag; @@ -381,7 +375,7 @@ inline void IncompleteCholesky::updateList(Ref::updateList(Ref= abs(row(ncut)) if incut + * abs(row(i)) <= abs(row(ncut)) if i>ncut * \param row The vector of values * \param ind The array of index for the elements in @p row * \param ncut The number of largest elements to keep - **/ + **/ template Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) { @@ -34,15 +34,15 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) Index mid; Index n = row.size(); /* length of the vector */ Index first, last ; - + ncut--; /* to fit the zero-based indices */ - first = 0; - last = n-1; + first = 0; + last = n-1; if (ncut < first || ncut > last ) return 0; - + do { - mid = first; - RealScalar abskey = abs(row(mid)); + mid = first; + RealScalar abskey = abs(row(mid)); for (Index j = first + 1; j <= last; j++) { if ( abs(row(j)) > abskey) { ++mid; @@ -53,12 +53,12 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) /* Interchange for the pivot element */ swap(row(mid), row(first)); swap(ind(mid), ind(first)); - + if (mid > ncut) last = mid - 1; - else if (mid < ncut ) first = mid + 1; + else if (mid < ncut ) first = mid + 1; } while (mid != ncut ); - - return 0; /* mid is equal to ncut */ + + return 0; /* mid is equal to ncut */ } }// end namespace internal @@ -71,23 +71,23 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut) * * During the numerical factorization, two dropping rules are used : * 1) any element whose magnitude is less than some tolerance is dropped. 
- * This tolerance is obtained by multiplying the input tolerance @p droptol + * This tolerance is obtained by multiplying the input tolerance @p droptol * by the average magnitude of all the original elements in the current row. - * 2) After the elimination of the row, only the @p fill largest elements in - * the L part and the @p fill largest elements in the U part are kept - * (in addition to the diagonal element ). Note that @p fill is computed from - * the input parameter @p fillfactor which is used the ratio to control the fill_in + * 2) After the elimination of the row, only the @p fill largest elements in + * the L part and the @p fill largest elements in the U part are kept + * (in addition to the diagonal element ). Note that @p fill is computed from + * the input parameter @p fillfactor which is used the ratio to control the fill_in * relatively to the initial number of nonzero elements. - * + * * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements) - * and when @p fill=n/2 with @p droptol being different to zero. - * - * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, + * and when @p fill=n/2 with @p droptol being different to zero. + * + * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, * Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994. - * + * * NOTE : The following implementation is derived from the ILUT implementation - * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota - * released under the terms of the GNU LGPL: + * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota + * released under the terms of the GNU LGPL: * http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2. 
* See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012: @@ -115,28 +115,28 @@ class IncompleteLUT : public SparseSolverBase::dummy_precision()), m_fillfactor(10), m_analysisIsOk(false), m_factorizationIsOk(false) {} - + template explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits::dummy_precision(), int fillfactor = 10) : m_droptol(droptol),m_fillfactor(fillfactor), m_analysisIsOk(false),m_factorizationIsOk(false) { eigen_assert(fillfactor != 0); - compute(mat); + compute(mat); } - - Index rows() const { return m_lu.rows(); } - - Index cols() const { return m_lu.cols(); } + + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
*/ ComputationInfo info() const @@ -144,36 +144,36 @@ class IncompleteLUT : public SparseSolverBase void analyzePattern(const MatrixType& amat); - + template void factorize(const MatrixType& amat); - + /** * Compute an incomplete LU factorization with dual threshold on the matrix mat * No pivoting is done in this version - * + * **/ template IncompleteLUT& compute(const MatrixType& amat) { - analyzePattern(amat); + analyzePattern(amat); factorize(amat); return *this; } - void setDroptol(const RealScalar& droptol); - void setFillfactor(int fillfactor); - + void setDroptol(const RealScalar& droptol); + void setFillfactor(int fillfactor); + template void _solve_impl(const Rhs& b, Dest& x) const { x = m_Pinv * b; x = m_lu.template triangularView().solve(x); x = m_lu.template triangularView().solve(x); - x = m_P * x; + x = m_P * x; } protected: @@ -200,22 +200,22 @@ class IncompleteLUT : public SparseSolverBase void IncompleteLUT::setDroptol(const RealScalar& droptol) { - this->m_droptol = droptol; + this->m_droptol = droptol; } /** * Set control parameter fillfactor - * \param fillfactor This is used to compute the number @p fill_in of largest elements to keep on each row. - **/ + * \param fillfactor This is used to compute the number @p fill_in of largest elements to keep on each row. + **/ template void IncompleteLUT::setFillfactor(int fillfactor) { - this->m_fillfactor = fillfactor; + this->m_fillfactor = fillfactor; } template @@ -225,24 +225,15 @@ void IncompleteLUT::analyzePattern(const _MatrixType& amat) // Compute the Fill-reducing permutation // Since ILUT does not perform any numerical pivoting, // it is highly preferable to keep the diagonal through symmetric permutations. -#ifndef EIGEN_MPL2_ONLY // To this end, let's symmetrize the pattern and perform AMD on it. SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. 
- // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... + // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred... SparseMatrix AtA = mat2 + mat1; AMDOrdering ordering; ordering(AtA,m_P); m_Pinv = m_P.inverse(); // cache the inverse permutation -#else - // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine. - SparseMatrix mat1 = amat; - COLAMDOrdering ordering; - ordering(mat1,m_Pinv); - m_P = m_Pinv.inverse(); -#endif - m_analysisIsOk = true; m_factorizationIsOk = false; m_isInitialized = true; diff --git a/externals/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 7c2326eb..28a0c510 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/externals/eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -10,7 +10,7 @@ #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H #define EIGEN_ITERATIVE_SOLVER_BASE_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -145,7 +145,7 @@ class IterativeSolverBase : public SparseSolverBase protected: typedef SparseSolverBase Base; using Base::m_isInitialized; - + public: typedef typename internal::traits::MatrixType MatrixType; typedef typename internal::traits::Preconditioner Preconditioner; @@ -169,10 +169,10 @@ class IterativeSolverBase : public SparseSolverBase } /** Initialize the solver with matrix \a A for further \c Ax=b solving. - * + * * This constructor is a shortcut for the default constructor followed * by a call to compute(). - * + * * \warning this class stores a reference to the matrix A as well as some * precomputed values that depend on it. Therefore, if \a A is changed * this class becomes invalid. 
Call compute() to update it with the new @@ -187,7 +187,7 @@ class IterativeSolverBase : public SparseSolverBase } ~IterativeSolverBase() {} - + /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems. * * Currently, this function mostly calls analyzePattern on the preconditioner. In the future @@ -203,7 +203,7 @@ class IterativeSolverBase : public SparseSolverBase m_info = m_preconditioner.info(); return derived(); } - + /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems. * * Currently, this function mostly calls factorize on the preconditioner. @@ -216,7 +216,7 @@ class IterativeSolverBase : public SparseSolverBase template Derived& factorize(const EigenBase& A) { - eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); + eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); grab(A.derived()); m_preconditioner.factorize(matrix()); m_factorizationIsOk = true; @@ -247,16 +247,16 @@ class IterativeSolverBase : public SparseSolverBase } /** \internal */ - Index rows() const { return matrix().rows(); } + EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { return matrix().rows(); } /** \internal */ - Index cols() const { return matrix().cols(); } + EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { return matrix().cols(); } /** \returns the tolerance threshold used by the stopping criteria. * \sa setTolerance() */ RealScalar tolerance() const { return m_tolerance; } - + /** Sets the tolerance threshold used by the stopping criteria. * * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|. @@ -270,19 +270,19 @@ class IterativeSolverBase : public SparseSolverBase /** \returns a read-write reference to the preconditioner for custom configuration. */ Preconditioner& preconditioner() { return m_preconditioner; } - + /** \returns a read-only reference to the preconditioner. 
*/ const Preconditioner& preconditioner() const { return m_preconditioner; } /** \returns the max number of iterations. - * It is either the value setted by setMaxIterations or, by default, + * It is either the value set by setMaxIterations or, by default, * twice the number of columns of the matrix. */ Index maxIterations() const { return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations; } - + /** Sets the max number of iterations. * Default is twice the number of columns of the matrix. */ @@ -328,13 +328,13 @@ class IterativeSolverBase : public SparseSolverBase eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized."); return m_info; } - + /** \internal */ template - void _solve_impl(const Rhs& b, SparseMatrixBase &aDest) const + void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase &aDest) const { eigen_assert(rows()==b.rows()); - + Index rhsCols = b.cols(); Index size = b.rows(); DestDerived& dest(aDest.derived()); @@ -344,15 +344,65 @@ class IterativeSolverBase : public SparseSolverBase // We do not directly fill dest because sparse expressions have to be free of aliasing issue. // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other. 
typename DestDerived::PlainObject tmp(cols(),rhsCols); + ComputationInfo global_info = Success; for(Index k=0; k + typename internal::enable_if::type + _solve_with_guess_impl(const Rhs& b, MatrixBase &aDest) const + { + eigen_assert(rows()==b.rows()); + + Index rhsCols = b.cols(); + DestDerived& dest(aDest.derived()); + ComputationInfo global_info = Success; + for(Index k=0; k + typename internal::enable_if::type + _solve_with_guess_impl(const Rhs& b, MatrixBase &dest) const + { + derived()._solve_vector_with_guess_impl(b,dest.derived()); + } + + /** \internal default initial guess = 0 */ + template + void _solve_impl(const Rhs& b, Dest& x) const + { + x.setZero(); + derived()._solve_with_guess_impl(b,x); + } + protected: void init() { @@ -370,19 +420,19 @@ class IterativeSolverBase : public SparseSolverBase { return m_matrixWrapper.matrix(); } - + template void grab(const InputType &A) { m_matrixWrapper.grab(A); } - + MatrixWrapper m_matrixWrapper; Preconditioner m_preconditioner; Index m_maxIterations; RealScalar m_tolerance; - + mutable RealScalar m_error; mutable Index m_iterations; mutable ComputationInfo m_info; diff --git a/externals/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index 0aea0e09..203fd0ec 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/externals/eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -182,32 +182,14 @@ class LeastSquaresConjugateGradient : public IterativeSolverBase - void _solve_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const { m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - for(Index j=0; j - void _solve_impl(const MatrixBase& b, Dest& x) const - { - x.setZero(); - _solve_with_guess_impl(b.derived(),x); - } }; diff --git 
a/externals/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/externals/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h index 0ace4517..7b896575 100644 --- a/externals/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +++ b/externals/eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h @@ -13,7 +13,7 @@ namespace Eigen { template class SolveWithGuess; - + /** \class SolveWithGuess * \ingroup IterativeLinearSolvers_Module * @@ -45,13 +45,15 @@ class SolveWithGuess : public internal::generic_xpr_base::PlainObject PlainObject; typedef typename internal::generic_xpr_base, MatrixXpr, typename internal::traits::StorageKind>::type Base; typedef typename internal::ref_selector::type Nested; - + SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess) : m_dec(dec), m_rhs(rhs), m_guess(guess) {} - - EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); } + + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index rows() const EIGEN_NOEXCEPT { return m_dec.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + Index cols() const EIGEN_NOEXCEPT { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; } EIGEN_DEVICE_FUNC const RhsType& rhs() const { return m_rhs; } @@ -61,7 +63,7 @@ class SolveWithGuess : public internal::generic_xpr_base > m_result = solve.guess(); solve.dec()._solve_with_guess_impl(solve.rhs(), m_result); } - -protected: + +protected: PlainObject m_result; }; @@ -108,7 +110,7 @@ struct Assignment, interna } }; -} // end namepsace internal +} // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/Jacobi/Jacobi.h b/externals/eigen/Eigen/src/Jacobi/Jacobi.h index d25af8e9..76668a57 100644 --- a/externals/eigen/Eigen/src/Jacobi/Jacobi.h +++ b/externals/eigen/Eigen/src/Jacobi/Jacobi.h @@ -11,7 +11,7 @@ #ifndef EIGEN_JACOBI_H #define EIGEN_JACOBI_H -namespace Eigen { +namespace Eigen { /** 
\ingroup Jacobi_Module * \jacobi_module @@ -37,17 +37,20 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ + EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ + EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - Scalar& c() { return m_c; } - Scalar c() const { return m_c; } - Scalar& s() { return m_s; } - Scalar s() const { return m_s; } + EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } + EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } + EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } + EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } /** Concatenates two planar rotation */ + EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -56,20 +59,27 @@ template class JacobiRotation } /** Returns the transposed transformation */ + EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ + EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template + EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); + EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0); + EIGEN_DEVICE_FUNC + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type); + EIGEN_DEVICE_FUNC + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); + EIGEN_DEVICE_FUNC + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; }; @@ -80,11 +90,12 @@ 
template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; using std::abs; - typedef typename NumTraits::Real RealScalar; + RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -124,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -133,7 +145,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields: * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$. * - * The value of \a z is returned if \a z is not null (the default is null). + * The value of \a r is returned if \a r is not null (the default is null). * Also note that G is built such that the cosine is always real. 
* * Example: \include Jacobi_makeGivens.cpp @@ -146,20 +158,22 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) +EIGEN_DEVICE_FUNC +void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { - makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); + makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); } // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; using std::abs; using numext::conj; - + if(q==Scalar(0)) { m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1); @@ -213,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -258,12 +273,13 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar namespace internal { /** \jacobi_module - * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y: + * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y: * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right ) = J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$ * * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } @@ -275,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, 
Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -290,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -298,132 +316,164 @@ inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiR } namespace internal { -template -void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) -{ - typedef typename VectorX::Scalar Scalar; - enum { PacketSize = packet_traits::size }; - typedef typename packet_traits::type Packet; - eigen_assert(xpr_x.size() == xpr_y.size()); - Index size = xpr_x.size(); - Index incrx = xpr_x.derived().innerStride(); - Index incry = xpr_y.derived().innerStride(); - - Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); - Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); - - OtherScalar c = j.c(); - OtherScalar s = j.s(); - if (c==OtherScalar(1) && s==OtherScalar(0)) - return; - /*** dynamic-size vectorized paths ***/ +template +struct apply_rotation_in_the_plane_selector +{ + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + for(Index i=0; i +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { - // both vectors are sequentially stored in memory => vectorization - enum { Peeling = 2 }; + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; + + /*** dynamic-size vectorized paths ***/ + if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + { + // both vectors are sequentially stored in memory => vectorization + enum { 
Peeling = 2 }; - Index alignedStart = internal::first_default_aligned(y, size); - Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; + Index alignedStart = internal::first_default_aligned(y, size); + Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - px += PacketSize; - py += PacketSize; + for(Index i=alignedStart; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } } - } - else - { - Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); - for(Index i=alignedStart; i(px); - Packet xi1 = ploadu(px+PacketSize); - Packet yi = pload (py); - Packet yi1 = pload (py+PacketSize); - pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1))); - px += Peeling*PacketSize; - py += Peeling*PacketSize; + Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); + for(Index i=alignedStart; i(px); + Packet xi1 = ploadu(px+PacketSize); + Packet yi = pload (py); + Packet yi1 = pload (py+PacketSize); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); + px += Peeling*PacketSize; + py += Peeling*PacketSize; + } + 
if(alignedEnd!=peelingEnd) + { + Packet xi = ploadu(x+peelingEnd); + Packet yi = pload (y+peelingEnd); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + } } - if(alignedEnd!=peelingEnd) + + for(Index i=alignedEnd; i(x+peelingEnd); - Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + Scalar xi = x[i]; + Scalar yi = y[i]; + x[i] = c * xi + numext::conj(s) * yi; + y[i] = -s * xi + numext::conj(c) * yi; } } - for(Index i=alignedEnd; i0) // FIXME should be compared to the required alignment { - Scalar xi = x[i]; - Scalar yi = y[i]; - x[i] = c * xi + numext::conj(s) * yi; - y[i] = -s * xi + numext::conj(c) * yi; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + Scalar* EIGEN_RESTRICT px = x; + Scalar* EIGEN_RESTRICT py = y; + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } } - } - /*** fixed-size vectorized path ***/ - else if(VectorX::SizeAtCompileTime != Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - (EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment - { - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; - Scalar* EIGEN_RESTRICT px = x; - Scalar* EIGEN_RESTRICT py = y; - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - px += PacketSize; - py += PacketSize; + apply_rotation_in_the_plane_selector::run(x,incrx,y,incry,size,c,s); } } +}; - /*** non-vectorized path ***/ - else - { - for(Index i=0; i +EIGEN_DEVICE_FUNC +void 
/*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) +{ + typedef typename VectorX::Scalar Scalar; + const bool Vectorizable = (int(VectorX::Flags) & int(VectorY::Flags) & PacketAccessBit) + && (int(packet_traits::size) == int(packet_traits::size)); + + eigen_assert(xpr_x.size() == xpr_y.size()); + Index size = xpr_x.size(); + Index incrx = xpr_x.derived().innerStride(); + Index incry = xpr_y.derived().innerStride(); + + Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); + Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); + + OtherScalar c = j.c(); + OtherScalar s = j.s(); + if (c==OtherScalar(1) && s==OtherScalar(0)) + return; + + apply_rotation_in_the_plane_selector< + Scalar,OtherScalar, + VectorX::SizeAtCompileTime, + EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment), + Vectorizable>::run(x,incrx,y,incry,size,c,s); } } // end namespace internal diff --git a/externals/eigen/Eigen/src/KLUSupport/KLUSupport.h b/externals/eigen/Eigen/src/KLUSupport/KLUSupport.h new file mode 100644 index 00000000..215db35b --- /dev/null +++ b/externals/eigen/Eigen/src/KLUSupport/KLUSupport.h @@ -0,0 +1,358 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Kyle Macfarlan +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_H +#define EIGEN_KLUSUPPORT_H + +namespace Eigen { + +/* TODO extract L, extract U, compute det, etc... */ + +/** \ingroup KLUSupport_Module + * \brief A sparse LU factorization and solver based on KLU + * + * This class allows to solve for A.X = B sparse linear problems via a LU factorization + * using the KLU library. The sparse matrix A must be squared and full rank. 
+ * The vectors or matrices X and B can be either dense or sparse. + * + * \warning The input matrix A should be in a \b compressed and \b column-major form. + * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. + * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> + * + * \implsparsesolverconcept + * + * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU + */ + + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) { + return klu_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); +} + +inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) { + return klu_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); +} + +inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { + return klu_z_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), 0, Common); +} + +inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) { + return klu_factor(Ap, Ai, Ax, Symbolic, Common); +} + +inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex) { + return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common); +} + + +template +class KLU : public 
SparseSolverBase > +{ + protected: + typedef SparseSolverBase > Base; + using Base::m_isInitialized; + public: + using Base::_solve_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef Matrix Vector; + typedef Matrix IntRowVectorType; + typedef Matrix IntColVectorType; + typedef SparseMatrix LUMatrixType; + typedef SparseMatrix KLUMatrixType; + typedef Ref KLUMatrixRef; + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + + public: + + KLU() + : m_dummy(0,0), mp_matrix(m_dummy) + { + init(); + } + + template + explicit KLU(const InputMatrixType& matrix) + : mp_matrix(matrix) + { + init(); + compute(matrix); + } + + ~KLU() + { + if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common); + if(m_numeric) klu_free_numeric(&m_numeric,&m_common); + } + + EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return mp_matrix.rows(); } + EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return mp_matrix.cols(); } + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful, + * \c NumericalIssue if the matrix.appears to be negative. 
+ */ + ComputationInfo info() const + { + eigen_assert(m_isInitialized && "Decomposition is not initialized."); + return m_info; + } +#if 0 // not implemented yet + inline const LUMatrixType& matrixL() const + { + if (m_extractedDataAreDirty) extractData(); + return m_l; + } + + inline const LUMatrixType& matrixU() const + { + if (m_extractedDataAreDirty) extractData(); + return m_u; + } + + inline const IntColVectorType& permutationP() const + { + if (m_extractedDataAreDirty) extractData(); + return m_p; + } + + inline const IntRowVectorType& permutationQ() const + { + if (m_extractedDataAreDirty) extractData(); + return m_q; + } +#endif + /** Computes the sparse Cholesky decomposition of \a matrix + * Note that the matrix should be column-major, and in compressed format for best performance. + * \sa SparseMatrix::makeCompressed(). + */ + template + void compute(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + grab(matrix.derived()); + analyzePattern_impl(); + factorize_impl(); + } + + /** Performs a symbolic decomposition on the sparcity of \a matrix. + * + * This function is particularly useful when solving for several problems having the same structure. + * + * \sa factorize(), compute() + */ + template + void analyzePattern(const InputMatrixType& matrix) + { + if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); + if(m_numeric) klu_free_numeric(&m_numeric, &m_common); + + grab(matrix.derived()); + + analyzePattern_impl(); + } + + + /** Provides access to the control settings array used by KLU. + * + * See KLU documentation for details. + */ + inline const klu_common& kluCommon() const + { + return m_common; + } + + /** Provides access to the control settings array used by UmfPack. + * + * If this array contains NaN's, the default values are used. + * + * See KLU documentation for details. 
+ */ + inline klu_common& kluCommon() + { + return m_common; + } + + /** Performs a numeric decomposition of \a matrix + * + * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed. + * + * \sa analyzePattern(), compute() + */ + template + void factorize(const InputMatrixType& matrix) + { + eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()"); + if(m_numeric) + klu_free_numeric(&m_numeric,&m_common); + + grab(matrix.derived()); + + factorize_impl(); + } + + /** \internal */ + template + bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; + +#if 0 // not implemented yet + Scalar determinant() const; + + void extractData() const; +#endif + + protected: + + void init() + { + m_info = InvalidInput; + m_isInitialized = false; + m_numeric = 0; + m_symbolic = 0; + m_extractedDataAreDirty = true; + + klu_defaults(&m_common); + } + + void analyzePattern_impl() + { + m_info = InvalidInput; + m_analysisIsOk = false; + m_factorizationIsOk = false; + m_symbolic = klu_analyze(internal::convert_index(mp_matrix.rows()), + const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), + &m_common); + if (m_symbolic) { + m_isInitialized = true; + m_info = Success; + m_analysisIsOk = true; + m_extractedDataAreDirty = true; + } + } + + void factorize_impl() + { + + m_numeric = klu_factor(const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), const_cast(mp_matrix.valuePtr()), + m_symbolic, &m_common, Scalar()); + + + m_info = m_numeric ? Success : NumericalIssue; + m_factorizationIsOk = m_numeric ? 
1 : 0; + m_extractedDataAreDirty = true; + } + + template + void grab(const EigenBase &A) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A.derived()); + } + + void grab(const KLUMatrixRef &A) + { + if(&(A.derived()) != &mp_matrix) + { + mp_matrix.~KLUMatrixRef(); + ::new (&mp_matrix) KLUMatrixRef(A); + } + } + + // cached data to reduce reallocation, etc. +#if 0 // not implemented yet + mutable LUMatrixType m_l; + mutable LUMatrixType m_u; + mutable IntColVectorType m_p; + mutable IntRowVectorType m_q; +#endif + + KLUMatrixType m_dummy; + KLUMatrixRef mp_matrix; + + klu_numeric* m_numeric; + klu_symbolic* m_symbolic; + klu_common m_common; + mutable ComputationInfo m_info; + int m_factorizationIsOk; + int m_analysisIsOk; + mutable bool m_extractedDataAreDirty; + + private: + KLU(const KLU& ) { } +}; + +#if 0 // not implemented yet +template +void KLU::extractData() const +{ + if (m_extractedDataAreDirty) + { + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + + // get size of the data + int lnz, unz, rows, cols, nz_udiag; + umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); + + // allocate data + m_l.resize(rows,(std::min)(rows,cols)); + m_l.resizeNonZeros(lnz); + + m_u.resize((std::min)(rows,cols),cols); + m_u.resizeNonZeros(unz); + + m_p.resize(rows); + m_q.resize(cols); + + // extract + umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), + m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), + m_p.data(), m_q.data(), 0, 0, 0, m_numeric); + + m_extractedDataAreDirty = false; + } +} + +template +typename KLU::Scalar KLU::determinant() const +{ + eigen_assert(false && "KLU: extractData Not Yet Implemented"); + return Scalar(); +} +#endif + +template +template +bool KLU::_solve_impl(const MatrixBase &b, MatrixBase &x) const +{ + Index rhsCols = b.cols(); + EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + 
eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); + + x = b; + int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); + + m_info = info!=0 ? Success : NumericalIssue; + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_KLUSUPPORT_H diff --git a/externals/eigen/Eigen/src/LU/Determinant.h b/externals/eigen/Eigen/src/LU/Determinant.h index d6a3c1e5..3a41e6fc 100644 --- a/externals/eigen/Eigen/src/LU/Determinant.h +++ b/externals/eigen/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -22,14 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b)); } -template -const typename Derived::Scalar bruteforce_det4_helper -(const MatrixBase& matrix, int j, int k, int m, int n) -{ - return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) - * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); -} - template struct determinant_impl @@ -44,7 +37,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +46,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename 
traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,15 +66,34 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + typedef typename traits::Scalar Scalar; + static EIGEN_DEVICE_FUNC + Scalar run(const Derived& m) + { + Scalar d2_01 = det2(m, 0, 1); + Scalar d2_02 = det2(m, 0, 2); + Scalar d2_03 = det2(m, 0, 3); + Scalar d2_12 = det2(m, 1, 2); + Scalar d2_13 = det2(m, 1, 3); + Scalar d2_23 = det2(m, 2, 3); + Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12); + Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); + Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); + Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); + return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + + internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); + } +protected: + static EIGEN_DEVICE_FUNC + Scalar det2(const Derived& m, Index i0, Index i1) + { + return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1); + } + + static EIGEN_DEVICE_FUNC + Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) { - // trick by Martin Costabel to compute 4x4 det with only 30 muls - return bruteforce_det4_helper(m,0,1,2,3) - - bruteforce_det4_helper(m,0,2,1,3) - + bruteforce_det4_helper(m,0,3,1,2) - + bruteforce_det4_helper(m,1,2,0,3) - - bruteforce_det4_helper(m,1,3,0,2) - + bruteforce_det4_helper(m,2,3,0,1); + return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); } }; @@ -89,6 +104,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/externals/eigen/Eigen/src/LU/FullPivLU.h b/externals/eigen/Eigen/src/LU/FullPivLU.h index 03b6af70..ba1749fa 100644 --- 
a/externals/eigen/Eigen/src/LU/FullPivLU.h +++ b/externals/eigen/Eigen/src/LU/FullPivLU.h @@ -18,6 +18,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -48,12 +49,12 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template class FullPivLU @@ -62,9 +63,9 @@ template class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -218,6 +219,7 @@ template class FullPivLU return internal::image_retval(*this, originalMatrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -237,14 +239,10 @@ template class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "LU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. 
@@ -320,7 +318,7 @@ template class FullPivLU return m_usePrescribedThreshold ? m_prescribedThreshold // this formula comes from experimenting (see "LU precision tuning" thread on the list) // and turns out to be identical to Higham's formula used already in LDLt. - : NumTraits::epsilon() * m_lu.diagonalSize(); + : NumTraits::epsilon() * RealScalar(m_lu.diagonalSize()); } /** \returns the rank of the matrix of which *this is the LU decomposition. @@ -406,16 +404,16 @@ template class FullPivLU MatrixType reconstructedMatrix() const; - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif @@ -531,8 +529,8 @@ void FullPivLU::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = i; - m_colsTranspositions.coeffRef(i) = i; + m_rowsTranspositions.coeffRef(i) = internal::convert_index(i); + m_colsTranspositions.coeffRef(i) = internal::convert_index(i); } break; } @@ -543,8 +541,8 @@ void FullPivLU::computeInPlace() // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). 
- m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; - m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; + m_rowsTranspositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_colsTranspositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; @@ -757,7 +755,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -807,7 +804,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -821,29 +817,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - if (Conjugate) { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .adjoint() - .solveInPlace(c.topRows(smalldim)); - } else { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .transpose() - .solveInPlace(c.topRows(smalldim)); - } + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + 
.template triangularView() + .transpose() + .template conjugateIf() + .solveInPlace(c.topRows(smalldim)); // Step 4 PermutationPType invp = permutationP().inverse().eval(); diff --git a/externals/eigen/Eigen/src/LU/InverseImpl.h b/externals/eigen/Eigen/src/LU/InverseImpl.h index 018f99b5..a40cefa9 100644 --- a/externals/eigen/Eigen/src/LU/InverseImpl.h +++ b/externals/eigen/Eigen/src/LU/InverseImpl.h @@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper( const MatrixType& matrix, const typename ResultType::Scalar& invdet, ResultType& result) { + typename ResultType::Scalar temp = matrix.coeff(0,0); result.coeffRef(0,0) = matrix.coeff(1,1) * invdet; result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet; result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet; - result.coeffRef(1,1) = matrix.coeff(0,0) * invdet; + result.coeffRef(1,1) = temp * invdet; } template @@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. 
+ typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3(matrix) * invdet; + const Scalar c11 = cofactor_3x3(matrix) * invdet; + const Scalar c02 = cofactor_3x3(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template @@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); @@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse::run(matrix_t, inverse); + } } }; @@ -290,6 +301,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +344,7 @@ struct Assignment, internal::assign_op 
+EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) @@ -345,6 +358,8 @@ inline const Inverse MatrixBase::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. @@ -385,6 +400,8 @@ inline void MatrixBase::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. @@ -404,7 +421,7 @@ inline void MatrixBase::computeInverseWithCheck( const RealScalar& absDeterminantThreshold ) const { - RealScalar determinant; + Scalar determinant; // i'd love to put some static assertions there, but SFINAE means that they have no effect... 
eigen_assert(rows() == cols()); computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold); diff --git a/externals/eigen/Eigen/src/LU/PartialPivLU.h b/externals/eigen/Eigen/src/LU/PartialPivLU.h index d4396188..34aed724 100644 --- a/externals/eigen/Eigen/src/LU/PartialPivLU.h +++ b/externals/eigen/Eigen/src/LU/PartialPivLU.h @@ -19,6 +19,7 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -69,7 +70,7 @@ struct enable_if_ref,Derived> { * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP(). * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ template class PartialPivLU @@ -79,8 +80,9 @@ template class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase Base; + friend class SolverBase; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -152,6 +154,7 @@ template class PartialPivLU return m_p; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -169,14 +172,10 @@ template class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. 
template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -217,8 +216,8 @@ template class PartialPivLU MatrixType reconstructedMatrix() const; - inline Index rows() const { return m_lu.rows(); } - inline Index cols() const { return m_lu.cols(); } + EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template @@ -231,8 +230,6 @@ template class PartialPivLU * Step 3: replace c by the solution x to Ux = c. */ - eigen_assert(rhs.rows() == m_lu.rows()); - // Step 1 dst = permutationP() * rhs; @@ -246,26 +243,21 @@ template class PartialPivLU template EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. * So we proceed as follows: - * Step 1: compute c = Pb. - * Step 2: replace c by the solution x to Lx = c. - * Step 3: replace c by the solution x to Ux = c. + * Step 1: compute c as the solution to L^T c = b + * Step 2: replace c by the solution x to U^T x = c. + * Step 3: update c = P^-1 c. 
*/ eigen_assert(rhs.rows() == m_lu.cols()); - if (Conjugate) { - // Step 1 - dst = m_lu.template triangularView().adjoint().solve(rhs); - // Step 2 - m_lu.template triangularView().adjoint().solveInPlace(dst); - } else { - // Step 1 - dst = m_lu.template triangularView().transpose().solve(rhs); - // Step 2 - m_lu.template triangularView().transpose().solveInPlace(dst); - } + // Step 1 + dst = m_lu.template triangularView().transpose() + .template conjugateIf().solve(rhs); + // Step 2 + m_lu.template triangularView().transpose() + .template conjugateIf().solveInPlace(dst); // Step 3 dst = permutationP().transpose() * dst; } @@ -339,17 +331,18 @@ PartialPivLU::PartialPivLU(EigenBase& matrix) namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ -template +template struct partial_lu_impl { - // FIXME add a stride to Map, so that the following mapping becomes easier, - // another option would be to create an expression being able to automatically - // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly - // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, - // and Block. - typedef Map > MapLU; - typedef Block MatrixType; - typedef Block BlockType; + static const int UnBlockedBound = 16; + static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; + static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + // Remaining rows and columns at compile-time: + static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; + static const int RCols = SizeAtCompileTime==2 ? 
1 : Dynamic; + typedef Matrix MatrixType; + typedef Ref MatrixTypeRef; + typedef Ref > BlockType; typedef typename MatrixType::RealScalar RealScalar; /** \internal performs the LU decomposition in-place of the matrix \a lu @@ -362,19 +355,22 @@ struct partial_lu_impl * * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. */ - static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) + static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { typedef scalar_score_coeff_op Scoring; typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); + // For small compile-time matrices it is worth processing the last row separately: + // speedup: +100% for 2x2, +10% for others. + const Index endk = UnBlockedAtCompileTime ? size-1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; - for(Index k = 0; k < size; ++k) + for(Index k = 0; k < endk; ++k) { - Index rrows = rows-k-1; - Index rcols = cols-k-1; + int rrows = internal::convert_index(rows-k-1); + int rcols = internal::convert_index(cols-k-1); Index row_of_biggest_in_col; Score biggest_in_corner @@ -391,9 +387,7 @@ struct partial_lu_impl ++nb_transpositions; } - // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) - // overflow but not the actual quotient? 
- lu.col(k).tail(rrows) /= lu.coeff(k,k); + lu.col(k).tail(fix(rrows)) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) { @@ -403,8 +397,18 @@ struct partial_lu_impl } if(k(rrows),fix(rcols)).noalias() -= lu.col(k).tail(fix(rrows)) * lu.row(k).tail(fix(rcols)); + } + + // special handling of the last entry + if(UnBlockedAtCompileTime) + { + Index k = endk; + row_transpositions[k] = PivIndex(k); + if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) + first_zero_pivot = k; } + return first_zero_pivot; } @@ -420,18 +424,17 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { - MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); - MatrixType lu(lu1,0,0,rows,cols); + MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride)); const Index size = (std::min)(rows,cols); // if the matrix is too small, no blocking: - if(size<=16) + if(UnBlockedAtCompileTime || size<=UnBlockedBound) { return unblocked_lu(lu, row_transpositions, nb_transpositions); } @@ -457,12 +460,12 @@ struct partial_lu_impl // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0(lu,0,0,rows,k); - BlockType A_2(lu,0,k+bs,rows,tsize); - BlockType A11(lu,k,k,bs,bs); - BlockType A12(lu,k,k+bs,bs,tsize); - BlockType A21(lu,k+bs,k,trows,bs); - BlockType A22(lu,k+bs,k+bs,trows,tsize); + BlockType A_0 = lu.block(0,0,rows,k); + 
BlockType A_2 = lu.block(0,k+bs,rows,tsize); + BlockType A11 = lu.block(k,k,bs,bs); + BlockType A12 = lu.block(k,k+bs,bs,tsize); + BlockType A21 = lu.block(k+bs,k,trows,bs); + BlockType A22 = lu.block(k+bs,k+bs,trows,tsize); PivIndex nb_transpositions_in_panel; // recursively call the blocked LU algorithm on [A11^T A21^T]^T @@ -501,11 +504,18 @@ struct partial_lu_impl template void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix. + if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl - + < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, + typename TranspositionType::StorageIndex, + EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } @@ -519,7 +529,10 @@ void PartialPivLU::compute() // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()::highest()); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + if(m_lu.cols()>0) + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + else + m_l1_norm = RealScalar(0); eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = m_lu.rows(); diff --git a/externals/eigen/Eigen/src/LU/arch/InverseSize4.h b/externals/eigen/Eigen/src/LU/arch/InverseSize4.h new file mode 100644 index 00000000..a232ffc0 --- /dev/null +++ b/externals/eigen/Eigen/src/LU/arch/InverseSize4.h @@ -0,0 +1,351 @@ +// This file 
is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2001 Intel Corporation +// Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2009 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath. +// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M, +// adjugate of M and determinant of M respectively. M# is computed block-wise +// using specific formulae. For proof, see: +// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html +// Variable names are adopted from \src\LU\Inverse_SSE.h. +// +// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h +// comes from the following Intel's library: +// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ +// +// Here is the respective copyright and license statement: +// +// Copyright (c) 2001 Intel Corporation. +// +// Permition is granted to use, copy, distribute and prepare derivative works +// of this library for any purpose and without fee, provided, that the above +// copyright notice and this statement appear in all copies. +// Intel makes no representations about the suitability of this software for +// any purpose, and specifically disclaims all warranties. +// See LEGAL.TXT for all the legal information. +// +// TODO: Unify implementations of different data types (i.e. float and double). 
+#ifndef EIGEN_INVERSE_SIZE_4_H +#define EIGEN_INVERSE_SIZE_4_H + +namespace Eigen +{ +namespace internal +{ +template +struct compute_inverse_size4 +{ + enum + { + MatrixAlignment = traits::Alignment, + ResultAlignment = traits::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); + + // Four 2x2 sub-matrices of the input matrix + // input = [[A, B], + // [C, D]] + Packet4f A, B, C, D; + + if (!StorageOrdersMatch) + { + A = vec4f_unpacklo(_L1, _L2); + B = vec4f_unpacklo(_L3, _L4); + C = vec4f_unpackhi(_L1, _L2); + D = vec4f_unpackhi(_L3, _L4); + } + else + { + A = vec4f_movelh(_L1, _L2); + B = vec4f_movehl(_L2, _L1); + C = vec4f_movelh(_L3, _L4); + D = vec4f_movehl(_L4, _L3); + } + + Packet4f AB, DC; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. 
+ AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B); + AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1))); + + // DC = D#*C + DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C); + DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1))); + + // determinants of the sub-matrices + Packet4f dA, dB, dC, dD; + + dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A); + dA = psub(dA, vec4f_movehl(dA, dA)); + + dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B); + dB = psub(dB, vec4f_movehl(dB, dB)); + + dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C); + dC = psub(dC, vec4f_movehl(dC, dC)); + + dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D); + dD = psub(dD, vec4f_movehl(dD, dD)); + + Packet4f d, d1, d2; + + d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB); + d = padd(d, vec4f_movehl(d, d)); + d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0)); + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0); + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet4f rd = pdiv(pset1(1.0f), det); + + // Four sub-matrices of the inverse + Packet4f iA, iB, iC, iD; + + // iD = D*|A| - C*A#*B + iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB)); + iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB))); + iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD); + + // iA = A*|D| - B*D#*C + iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC)); + iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC))); + iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0)); + iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1))); + iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC = pmul(A, 
vec4f_swizzle2(DC, DC, 3, 0, 3, 0)); + iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1))); + iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC); + + const float sign_mask[4] = {0.0f, numext::bit_cast(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; + const Packet4f p4f_sign_PNNP = ploadu(sign_mask); + rd = pxor(rd, p4f_sign_PNNP); + iA = pmul(iA, rd); + iB = pmul(iB, rd); + iC = pmul(iC, rd); + iD = pmul(iD, rd); + + Index res_stride = result.outerStride(); + float *res = result.data(); + + pstoret(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1)); + pstoret(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0)); + pstoret(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1)); + pstoret(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0)); + } +}; + +#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG)) +// same algorithm as above, except that each operand is split into +// halves for two registers to hold. +template +struct compute_inverse_size4 +{ + enum + { + MatrixAlignment = traits::Alignment, + ResultAlignment = traits::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), + MatrixType const &, + typename MatrixType::PlainObject>::type + ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower + // row e.g. 
A1, upper row of A, A2, lower row of A + // input = [[A, B], = [[[A1, [B1, + // [C, D]] A2], B2]], + // [[C1, [D1, + // C2], D2]]] + + Packet2d A1, A2, B1, B2, C1, C2, D1, D2; + + const double* data = matrix.data(); + const Index stride = matrix.innerStride(); + if (StorageOrdersMatch) + { + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); + } + else + { + Packet2d temp; + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); + temp = A1; + A1 = vec2d_unpacklo(A1, A2); + A2 = vec2d_unpackhi(temp, A2); + + temp = C1; + C1 = vec2d_unpacklo(C1, C2); + C2 = vec2d_unpackhi(temp, C2); + + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); + + temp = B1; + B1 = vec2d_unpacklo(B1, B2); + B2 = vec2d_unpackhi(temp, B2); + + temp = D1; + D1 = vec2d_unpacklo(D1, D2); + D2 = vec2d_unpackhi(temp, D2); + } + + // determinants of the sub-matrices + Packet2d dA, dB, dC, dD; + + dA = vec2d_swizzle2(A2, A2, 1); + dA = pmul(A1, dA); + dA = psub(dA, vec2d_duplane(dA, 1)); + + dB = vec2d_swizzle2(B2, B2, 1); + dB = pmul(B1, dB); + dB = psub(dB, vec2d_duplane(dB, 1)); + + dC = vec2d_swizzle2(C2, C2, 1); + dC = pmul(C1, dC); + dC = psub(dC, vec2d_duplane(dC, 1)); + + dD = vec2d_swizzle2(D2, D2, 1); + dD = pmul(D1, dD); + dD = psub(dD, vec2d_duplane(dD, 1)); + + Packet2d DC1, DC2, AB1, AB2; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. 
+ AB1 = pmul(B1, vec2d_duplane(A2, 1)); + AB2 = pmul(B2, vec2d_duplane(A1, 0)); + AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1))); + AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0))); + + // DC = D#*C + DC1 = pmul(C1, vec2d_duplane(D2, 1)); + DC2 = pmul(C2, vec2d_duplane(D1, 0)); + DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1))); + DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0))); + + Packet2d d1, d2; + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet2d det; + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet2d rd; + + d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0)); + d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3)); + rd = padd(d1, d2); + rd = padd(rd, vec2d_duplane(rd, 1)); + + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + det = padd(d1, d2); + det = psub(det, rd); + det = vec2d_duplane(det, 0); + rd = pdiv(pset1(1.0), det); + + // rows of four sub-matrices of the inverse + Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2; + + // iD = D*|A| - C*A#*B + iD1 = pmul(AB1, vec2d_duplane(C1, 0)); + iD2 = pmul(AB1, vec2d_duplane(C2, 0)); + iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1))); + iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1))); + dA = vec2d_duplane(dA, 0); + iD1 = psub(pmul(D1, dA), iD1); + iD2 = psub(pmul(D2, dA), iD2); + + // iA = A*|D| - B*D#*C + iA1 = pmul(DC1, vec2d_duplane(B1, 0)); + iA2 = pmul(DC1, vec2d_duplane(B2, 0)); + iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1))); + iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1))); + dD = vec2d_duplane(dD, 0); + iA1 = psub(pmul(A1, dD), iA1); + iA2 = psub(pmul(A2, dD), iA2); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1)); + iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1)); + iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2))); + iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2))); + dB = vec2d_duplane(dB, 0); + iB1 = psub(pmul(C1, dB), iB1); + iB2 = psub(pmul(C2, dB), 
iB2); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1)); + iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1)); + iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2))); + iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2))); + dC = vec2d_duplane(dC, 0); + iC1 = psub(pmul(B1, dC), iC1); + iC2 = psub(pmul(B2, dC), iC2); + + const double sign_mask1[2] = {0.0, numext::bit_cast(0x8000000000000000ull)}; + const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; + const Packet2d sign_PN = ploadu(sign_mask1); + const Packet2d sign_NP = ploadu(sign_mask2); + d1 = pxor(rd, sign_PN); + d2 = pxor(rd, sign_NP); + + Index res_stride = result.outerStride(); + double *res = result.data(); + pstoret(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1)); + pstoret(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2)); + pstoret(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1)); + pstoret(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2)); + pstoret(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1)); + pstoret(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2)); + pstoret(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1)); + pstoret(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2)); + } +}; +#endif +} // namespace internal +} // namespace Eigen +#endif diff --git a/externals/eigen/Eigen/src/LU/arch/Inverse_SSE.h b/externals/eigen/Eigen/src/LU/arch/Inverse_SSE.h deleted file mode 100644 index ebb64a62..00000000 --- a/externals/eigen/Eigen/src/LU/arch/Inverse_SSE.h +++ /dev/null @@ -1,338 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2001 Intel Corporation -// Copyright (C) 2010 Gael Guennebaud -// Copyright (C) 2009 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// The SSE code for the 4x4 float and double matrix inverse in this file -// comes from the following Intel's library: -// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ -// -// Here is the respective copyright and license statement: -// -// Copyright (c) 2001 Intel Corporation. -// -// Permition is granted to use, copy, distribute and prepare derivative works -// of this library for any purpose and without fee, provided, that the above -// copyright notice and this statement appear in all copies. -// Intel makes no representations about the suitability of this software for -// any purpose, and specifically disclaims all warranties. -// See LEGAL.TXT for all the legal information. - -#ifndef EIGEN_INVERSE_SSE_H -#define EIGEN_INVERSE_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 }; - - // Load the full matrix into registers - __m128 _L1 = matrix.template packet( 0); - __m128 _L2 = matrix.template packet( 4); - __m128 _L3 = matrix.template packet( 8); - __m128 _L4 = matrix.template packet(12); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. 
Since each - // register holds four matrix element, the smaller matrices are - // represented as a registers. Hence we get a better locality of the - // calculations. - - __m128 A, B, C, D; // the four sub-matrices - if(!StorageOrdersMatch) - { - A = _mm_unpacklo_ps(_L1, _L2); - B = _mm_unpacklo_ps(_L3, _L4); - C = _mm_unpackhi_ps(_L1, _L2); - D = _mm_unpackhi_ps(_L3, _L4); - } - else - { - A = _mm_movelh_ps(_L1, _L2); - B = _mm_movehl_ps(_L2, _L1); - C = _mm_movelh_ps(_L3, _L4); - D = _mm_movehl_ps(_L4, _L3); - } - - __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices - DC, AB; - __m128 dA, dB, dC, dD; // determinant of the sub-matrices - __m128 det, d, d1, d2; - __m128 rd; // reciprocal of the determinant - - // AB = A# * B - AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); - AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); - // DC = D# * C - DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); - DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); - - // dA = |A| - dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); - dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); - // dB = |B| - dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); - dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); - - // dC = |C| - dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); - dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); - // dD = |D| - dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); - dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); - - // d = trace(AB*DC) = trace(A#*B*D#*C) - d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); - - // iD = C*A#*B - iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); - iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); - // iA = B*D#*C - iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); - iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); - - // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] - d = _mm_add_ps(d, _mm_movehl_ps(d, d)); - d = 
_mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); - d1 = _mm_mul_ss(dA,dD); - d2 = _mm_mul_ss(dB,dC); - - // iD = D*|A| - C*A#*B - iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); - - // iA = A*|D| - B*D#*C; - iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_sub_ss(_mm_add_ss(d1,d2),d); - rd = _mm_div_ss(_mm_set_ss(1.0f), det); - -// #ifdef ZERO_SINGULAR -// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); -// #endif - - // iB = D * (A#B)# = D*B#*A - iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); - iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); - // iC = A * (D#C)# = A*C#*D - iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); - iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); - - rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP)); - - // iB = C*|B| - D*B#*A - iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); - - // iC = B*|C| - A*C#*D; - iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); - - // iX = iX / det - iA = _mm_mul_ps(rd,iA); - iB = _mm_mul_ps(rd,iB); - iC = _mm_mul_ps(rd,iC); - iD = _mm_mul_ps(rd,iD); - - Index res_stride = result.outerStride(); - float* res = result.data(); - pstoret(res+0, _mm_shuffle_ps(iA,iB,0x77)); - pstoret(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); - pstoret(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); - pstoret(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); - } - -}; - -template -struct compute_inverse_size4 -{ - enum { - MatrixAlignment = traits::Alignment, - ResultAlignment = traits::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType 
matrix(mat); - const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register of the matrix holds two elements, the smaller matrices are - // consisted of two registers. Hence we get a better locality of the - // calculations. - - // the four sub-matrices - __m128d A1, A2, B1, B2, C1, C2, D1, D2; - - if(StorageOrdersMatch) - { - A1 = matrix.template packet( 0); B1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); B2 = matrix.template packet( 6); - C1 = matrix.template packet( 8); D1 = matrix.template packet(10); - C2 = matrix.template packet(12); D2 = matrix.template packet(14); - } - else - { - __m128d tmp; - A1 = matrix.template packet( 0); C1 = matrix.template packet( 2); - A2 = matrix.template packet( 4); C2 = matrix.template packet( 6); - tmp = A1; - A1 = _mm_unpacklo_pd(A1,A2); - A2 = _mm_unpackhi_pd(tmp,A2); - tmp = C1; - C1 = _mm_unpacklo_pd(C1,C2); - C2 = _mm_unpackhi_pd(tmp,C2); - - B1 = matrix.template packet( 8); D1 = matrix.template packet(10); - B2 = matrix.template packet(12); D2 = matrix.template packet(14); - tmp = B1; - B1 = _mm_unpacklo_pd(B1,B2); - B2 = _mm_unpackhi_pd(tmp,B2); - tmp = D1; - D1 = _mm_unpacklo_pd(D1,D2); - D2 = _mm_unpackhi_pd(tmp,D2); - } - - __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices - DC1, DC2, AB1, AB2; - __m128d dA, dB, dC, dD; // determinant of the sub-matrices - __m128d det, d1, d2, rd; - - // dA = |A| - dA = _mm_shuffle_pd(A2, A2, 1); - dA = _mm_mul_pd(A1, dA); - dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); - // dB = |B| - dB = _mm_shuffle_pd(B2, B2, 1); - dB = _mm_mul_pd(B1, dB); - dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); - - // AB = A# * B - AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); - AB2 = 
_mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0)); - AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); - AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); - - // dC = |C| - dC = _mm_shuffle_pd(C2, C2, 1); - dC = _mm_mul_pd(C1, dC); - dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); - // dD = |D| - dD = _mm_shuffle_pd(D2, D2, 1); - dD = _mm_mul_pd(D1, dD); - dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); - - // DC = D# * C - DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); - DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); - DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); - DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); - - // rd = trace(AB*DC) = trace(A#*B*D#*C) - d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); - d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); - rd = _mm_add_pd(d1, d2); - rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); - - // iD = C*A#*B - iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); - iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); - iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); - iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); - - // iA = B*D#*C - iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); - iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); - iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3))); - iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3))); - - // iD = D*|A| - C*A#*B - dA = _mm_shuffle_pd(dA,dA,0); - iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1); - iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2); - - // iA = A*|D| - B*D#*C; - dD = _mm_shuffle_pd(dD,dD,0); - iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1); - iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2); - - d1 = _mm_mul_sd(dA, dD); - d2 = _mm_mul_sd(dB, dC); - - // iB = D * (A#B)# = D*B#*A - iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1)); - iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1)); - iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2))); - iB2 = 
_mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2))); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_add_sd(d1, d2); - det = _mm_sub_sd(det, rd); - - // iC = A * (D#C)# = A*C#*D - iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1)); - iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1)); - iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2))); - iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2))); - - rd = _mm_div_sd(_mm_set_sd(1.0), det); -// #ifdef ZERO_SINGULAR -// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd); -// #endif - rd = _mm_shuffle_pd(rd,rd,0); - - // iB = C*|B| - D*B#*A - dB = _mm_shuffle_pd(dB,dB,0); - iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1); - iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2); - - d1 = _mm_xor_pd(rd, _Sign_PN); - d2 = _mm_xor_pd(rd, _Sign_NP); - - // iC = B*|C| - A*C#*D; - dC = _mm_shuffle_pd(dC,dC,0); - iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); - iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - - Index res_stride = result.outerStride(); - double* res = result.data(); - pstoret(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); - pstoret(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - pstoret(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); - pstoret(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - pstoret(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); - pstoret(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - pstoret(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); - pstoret(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_INVERSE_SSE_H diff --git a/externals/eigen/Eigen/src/OrderingMethods/Amd.h b/externals/eigen/Eigen/src/OrderingMethods/Amd.h index f91ecb24..7ca3f33b 100644 --- a/externals/eigen/Eigen/src/OrderingMethods/Amd.h +++ 
b/externals/eigen/Eigen/src/OrderingMethods/Amd.h @@ -2,32 +2,22 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* - NOTE: this routine has been adapted from the CSparse library: Copyright (c) 2006, Timothy A. Davis. http://www.suitesparse.com -CSparse is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -CSparse is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this Module; if not, write to the Free Software -Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - +The author of CSparse, Timothy A. Davis., has executed a license with Google LLC +to permit distribution of this code and derivative works as part of Eigen under +the Mozilla Public License v. 2.0, as stated at the top of this file. */ -#include "../Core/util/NonMPL2.h" - #ifndef EIGEN_SPARSE_AMD_H #define EIGEN_SPARSE_AMD_H diff --git a/externals/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h b/externals/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h index 933cd564..8e339a70 100644 --- a/externals/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/externals/eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -13,115 +13,119 @@ // Davis (davis@cise.ufl.edu), University of Florida. The algorithm was // developed in collaboration with John Gilbert, Xerox PARC, and Esmond // Ng, Oak Ridge National Laboratory. 
-// +// // Date: -// +// // September 8, 2003. Version 2.3. -// +// // Acknowledgements: -// +// // This work was supported by the National Science Foundation, under // grants DMS-9504974 and DMS-9803599. -// +// // Notice: -// +// // Copyright (c) 1998-2003 by the University of Florida. // All Rights Reserved. -// +// // THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY // EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. -// +// // Permission is hereby granted to use, copy, modify, and/or distribute // this program, provided that the Copyright, this License, and the // Availability of the original version is retained on all copies and made // accessible to the end-user of any code or package that includes COLAMD -// or any modified version of COLAMD. -// +// or any modified version of COLAMD. +// // Availability: -// +// // The colamd/symamd library is available at -// +// // http://www.suitesparse.com - + #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H namespace internal { + +namespace Colamd { + /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG #define COLAMD_NDEBUG #endif /* NDEBUG */ + + /* ========================================================================== */ /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -#define COLAMD_KNOBS 20 +const int NKnobs = 20; /* number of output statistics. Only stats [0..6] are currently used. */ -#define COLAMD_STATS 20 +const int NStats = 20; -/* knobs [0] and stats [0]: dense row knob and output statistic. */ -#define COLAMD_DENSE_ROW 0 +/* Indices into knobs and stats array. */ +enum KnobsStatsIndex { + /* knobs [0] and stats [0]: dense row knob and output statistic. */ + DenseRow = 0, -/* knobs [1] and stats [1]: dense column knob and output statistic. 
*/ -#define COLAMD_DENSE_COL 1 + /* knobs [1] and stats [1]: dense column knob and output statistic. */ + DenseCol = 1, -/* stats [2]: memory defragmentation count output statistic */ -#define COLAMD_DEFRAG_COUNT 2 + /* stats [2]: memory defragmentation count output statistic */ + DefragCount = 2, -/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ -#define COLAMD_STATUS 3 + /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ + Status = 3, -/* stats [4..6]: error info, or info on jumbled columns */ -#define COLAMD_INFO1 4 -#define COLAMD_INFO2 5 -#define COLAMD_INFO3 6 + /* stats [4..6]: error info, or info on jumbled columns */ + Info1 = 4, + Info2 = 5, + Info3 = 6 +}; /* error codes returned in stats [3]: */ -#define COLAMD_OK (0) -#define COLAMD_OK_BUT_JUMBLED (1) -#define COLAMD_ERROR_A_not_present (-1) -#define COLAMD_ERROR_p_not_present (-2) -#define COLAMD_ERROR_nrow_negative (-3) -#define COLAMD_ERROR_ncol_negative (-4) -#define COLAMD_ERROR_nnz_negative (-5) -#define COLAMD_ERROR_p0_nonzero (-6) -#define COLAMD_ERROR_A_too_small (-7) -#define COLAMD_ERROR_col_length_negative (-8) -#define COLAMD_ERROR_row_index_out_of_bounds (-9) -#define COLAMD_ERROR_out_of_memory (-10) -#define COLAMD_ERROR_internal_error (-999) - +enum Status { + Ok = 0, + OkButJumbled = 1, + ErrorANotPresent = -1, + ErrorPNotPresent = -2, + ErrorNrowNegative = -3, + ErrorNcolNegative = -4, + ErrorNnzNegative = -5, + ErrorP0Nonzero = -6, + ErrorATooSmall = -7, + ErrorColLengthNegative = -8, + ErrorRowIndexOutOfBounds = -9, + ErrorOutOfMemory = -10, + ErrorInternalError = -999 +}; /* ========================================================================== */ /* === Definitions ========================================================== */ /* ========================================================================== */ -#define ONES_COMPLEMENT(r) (-(r)-1) +template +IndexType ones_complement(const IndexType r) { + return (-(r)-1); +} /* 
-------------------------------------------------------------------------- */ - -#define COLAMD_EMPTY (-1) +const int Empty = -1; /* Row and column status */ -#define ALIVE (0) -#define DEAD (-1) +enum RowColumnStatus { + Alive = 0, + Dead = -1 +}; /* Column status */ -#define DEAD_PRINCIPAL (-1) -#define DEAD_NON_PRINCIPAL (-2) - -/* Macros for row and column status update and checking. */ -#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) -#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) -#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) -#define COL_IS_DEAD(c) (Col [c].start < ALIVE) -#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) -#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) -#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } -#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } -#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } +enum ColumnStatus { + DeadPrincipal = -1, + DeadNonPrincipal = -2 +}; /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -129,9 +133,9 @@ namespace internal { // == Row and Column structures == template -struct colamd_col +struct ColStructure { - IndexType start ; /* index for A of first row in this column, or DEAD */ + IndexType start ; /* index for A of first row in this column, or Dead */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -159,11 +163,21 @@ struct colamd_col IndexType degree_next ; /* next column, if col is in a degree list */ IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; - + + inline bool is_dead() const { return start < Alive; } + + inline bool is_alive() const { return start >= Alive; } + + inline bool is_dead_principal() const { return start == DeadPrincipal; } + + inline void kill_principal() { start = DeadPrincipal; } + + inline void 
kill_non_principal() { start = DeadNonPrincipal; } + }; - + template -struct Colamd_Row +struct RowStructure { IndexType start ; /* index for A of first col in this row */ IndexType length ; /* number of principal columns in this row */ @@ -177,13 +191,19 @@ struct Colamd_Row IndexType mark ; /* for computing set differences and marking dead rows*/ IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; - + + inline bool is_dead() const { return shared2.mark < Alive; } + + inline bool is_alive() const { return shared2.mark >= Alive; } + + inline void kill() { shared2.mark = Dead; } + }; - + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ - + /* The recommended length Alen of the array A passed to colamd is given by the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any @@ -192,41 +212,41 @@ struct Colamd_Row required for the Col and Row arrays, respectively, which are internal to colamd. An additional n_col space is the minimal amount of "elbow room", and nnz/5 more space is recommended for run time efficiency. - + This macro is not needed when using symamd. - + Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid gcc -pedantic warning messages. 
*/ template -inline IndexType colamd_c(IndexType n_col) -{ return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } +inline IndexType colamd_c(IndexType n_col) +{ return IndexType( ((n_col) + 1) * sizeof (ColStructure) / sizeof (IndexType) ) ; } template inline IndexType colamd_r(IndexType n_row) -{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row) / sizeof (IndexType)); } +{ return IndexType(((n_row) + 1) * sizeof (RowStructure) / sizeof (IndexType)); } // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure col [], IndexType A [], IndexType p [], IndexType stats[NStats] ); template -static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template -static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); +static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); template -static void order_children (IndexType n_col, colamd_col Col [], IndexType p []); +static void order_children (IndexType n_col, ColStructure Col [], IndexType p []); template -static void detect_super_cols 
(colamd_col Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; +static void detect_super_cols (ColStructure Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; template -static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType *pfree) ; +static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType *pfree) ; template -static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row [] ) ; +static inline IndexType clear_mark (IndexType n_row, RowStructure Row [] ) ; /* === No debugging ========================================================= */ @@ -240,37 +260,37 @@ static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row /** - * \brief Returns the recommended value of Alen - * - * Returns recommended value of Alen for use by colamd. - * Returns -1 if any input argument is negative. - * The use of this routine or macro is optional. - * Note that the macro uses its arguments more than once, - * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. - * + * \brief Returns the recommended value of Alen + * + * Returns recommended value of Alen for use by colamd. + * Returns -1 if any input argument is negative. + * The use of this routine or macro is optional. + * Note that the macro uses its arguments more than once, + * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. 
+ * * \param nnz nonzeros in A * \param n_row number of rows in A * \param n_col number of columns in A * \return recommended value of Alen for use by colamd */ template -inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col) +inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col) { if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); else - return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); + return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); } /** * \brief set default parameters The use of this routine is optional. - * - * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) + * + * Colamd: rows with more than (knobs [DenseRow] * n_col) * entries are removed prior to ordering. Columns with more than - * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to - * ordering, and placed last in the output column ordering. + * (knobs [DenseCol] * n_row) entries are removed prior to + * ordering, and placed last in the output column ordering. * - * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1, + * DenseRow and DenseCol are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. If so, they will @@ -279,37 +299,37 @@ inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType * not need to change, assuming that you either use * colamd_set_defaults, or pass a (double *) NULL pointer as the * knobs array to colamd or symamd. 
- * + * * \param knobs parameter settings for colamd */ -static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) +static inline void set_defaults(double knobs[NKnobs]) { /* === Local variables ================================================== */ - + int i ; if (!knobs) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < COLAMD_KNOBS ; i++) + for (i = 0 ; i < NKnobs ; i++) { knobs [i] = 0 ; } - knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ - knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ + knobs [Colamd::DenseRow] = 0.5 ; /* ignore rows over 50% dense */ + knobs [Colamd::DenseCol] = 0.5 ; /* ignore columns over 50% dense */ } -/** +/** * \brief Computes a column ordering using the column approximate minimum degree ordering - * + * * Computes a column ordering (Q) of A such that P(AQ)=LU or * (AQ)'AQ=LL' have less fill-in and require fewer floating point * operations than factorizing the unpermuted matrix A or A'A, * respectively. 
- * - * + * + * * \param n_row number of rows in A * \param n_col number of columns in A * \param Alen, size of the array A @@ -319,143 +339,143 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) * \param stats colamd output statistics and error codes */ template -static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS]) +static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats]) { /* === Local variables ================================================== */ - + IndexType i ; /* loop index */ IndexType nnz ; /* nonzeros in A */ IndexType Row_size ; /* size of Row [], in integers */ IndexType Col_size ; /* size of Col [], in integers */ IndexType need ; /* minimum required length of A */ - Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ - colamd_col *Col ; /* pointer into A of Col [0..n_col] array */ + Colamd::RowStructure *Row ; /* pointer into A of Row [0..n_row] array */ + Colamd::ColStructure *Col ; /* pointer into A of Col [0..n_col] array */ IndexType n_col2 ; /* number of non-dense, non-empty columns */ IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ - - + double default_knobs [NKnobs] ; /* default knobs array */ + + /* === Check the input arguments ======================================== */ - + if (!stats) { COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < COLAMD_STATS ; i++) + for (i = 0 ; i < NStats ; i++) { stats [i] = 0 ; } - stats [COLAMD_STATUS] = COLAMD_OK ; - stats [COLAMD_INFO1] = -1 ; - stats [COLAMD_INFO2] = -1 ; - + stats [Colamd::Status] = Colamd::Ok ; + stats [Colamd::Info1] = -1 ; + stats 
[Colamd::Info2] = -1 ; + if (!A) /* A is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; + stats [Colamd::Status] = Colamd::ErrorANotPresent ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } - + if (!p) /* p is not present */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; + stats [Colamd::Status] = Colamd::ErrorPNotPresent ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } - + if (n_row < 0) /* n_row must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; - stats [COLAMD_INFO1] = n_row ; + stats [Colamd::Status] = Colamd::ErrorNrowNegative ; + stats [Colamd::Info1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } - + if (n_col < 0) /* n_col must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; - stats [COLAMD_INFO1] = n_col ; + stats [Colamd::Status] = Colamd::ErrorNcolNegative ; + stats [Colamd::Info1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } - + nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; - stats [COLAMD_INFO1] = nnz ; + stats [Colamd::Status] = Colamd::ErrorNnzNegative ; + stats [Colamd::Info1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } - + if (p [0] != 0) { - stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; - stats [COLAMD_INFO1] = p [0] ; + stats [Colamd::Status] = Colamd::ErrorP0Nonzero ; + stats [Colamd::Info1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } - + /* === If no knobs, set default knobs =================================== */ - + if (!knobs) { - colamd_set_defaults (default_knobs) ; + set_defaults (default_knobs) ; knobs = default_knobs ; } - + /* === Allocate the Row and Col arrays from array A ===================== */ - + Col_size = colamd_c (n_col) ; Row_size = colamd_r (n_row) ; need = 2*nnz + n_col + Col_size 
+ Row_size ; - + if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; - stats [COLAMD_INFO1] = need ; - stats [COLAMD_INFO2] = Alen ; + stats [Colamd::Status] = Colamd::ErrorATooSmall ; + stats [Colamd::Info1] = need ; + stats [Colamd::Info2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } - + Alen -= Col_size + Row_size ; - Col = (colamd_col *) &A [Alen] ; - Row = (Colamd_Row *) &A [Alen + Col_size] ; + Col = (ColStructure *) &A [Alen] ; + Row = (RowStructure *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - - if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) + + if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; return (false) ; } - + /* === Initialize scores, kill dense rows/columns ======================= */ - Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, + Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; - + /* === Order the supercolumns =========================================== */ - - ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, + + ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; - + /* === Order the non-principal columns ================================== */ - - Eigen::internal::order_children (n_col, Col, p) ; - + + Colamd::order_children (n_col, Col, p) ; + /* === Return statistics in stats ======================================= */ - - stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; - stats [COLAMD_DENSE_COL] = n_col - n_col2 ; - stats [COLAMD_DEFRAG_COUNT] = ngarbage ; - COLAMD_DEBUG0 (("colamd: done.\n")) ; + + stats [Colamd::DenseRow] = n_row - n_row2 ; + stats [Colamd::DenseCol] = n_col - n_col2 ; + stats 
[Colamd::DefragCount] = ngarbage ; + COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -465,7 +485,6 @@ static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType * /* There are no user-callable routines beyond this point in the file */ - /* ========================================================================== */ /* === init_rows_cols ======================================================= */ /* ========================================================================== */ @@ -485,11 +504,11 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [COLAMD_STATS] /* colamd statistics */ + IndexType stats [NStats] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -512,24 +531,24 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = Col [col].length ; + stats [Colamd::Status] = Colamd::ErrorColLengthNegative ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = COLAMD_EMPTY ; - Col [col].shared4.degree_next = COLAMD_EMPTY ; + Col [col].shared3.prev = Empty ; + Col [col].shared4.degree_next = 
Empty ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [Info3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -551,10 +570,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - stats [COLAMD_INFO3] = n_row ; + stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + stats [Colamd::Info3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -563,10 +582,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. 
*/ - stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; - stats [COLAMD_INFO1] = col ; - stats [COLAMD_INFO2] = row ; - (stats [COLAMD_INFO3]) ++ ; + stats [Colamd::Status] = Colamd::OkButJumbled ; + stats [Colamd::Info1] = col ; + stats [Colamd::Info2] = row ; + (stats [Colamd::Info3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -604,7 +623,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [Status] == OkButJumbled) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -646,7 +665,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) + if (stats [Status] == OkButJumbled) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -701,11 +720,11 @@ static void init_scoring IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [COLAMD_KNOBS],/* parameters */ + double knobs [NKnobs],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -732,8 +751,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; - dense_col_count = 
numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -750,7 +769,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -761,7 +780,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (COL_IS_DEAD (c)) + if (Col[c].is_dead()) { continue ; } @@ -777,7 +796,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - KILL_PRINCIPAL_COL (c) ; + Col[c].kill_principal() ; } } COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; @@ -791,7 +810,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - KILL_ROW (r) ; + Row[r].kill() ; --n_row2 ; } else @@ -813,7 +832,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (COL_IS_DEAD (c)) + if (Col[c].is_dead()) { continue ; } @@ -826,7 +845,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (ROW_IS_DEAD (row)) + if (Row[row].is_dead()) { continue ; } @@ -845,7 +864,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - KILL_PRINCIPAL_COL (c) ; + Col[c].kill_principal() ; } else { @@ -870,7 +889,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = COLAMD_EMPTY ; + head [c] = Empty ; } min_score = n_col ; 
/* place in reverse order, so low column indices are at the front */ @@ -878,7 +897,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (COL_IS_ALIVE (c)) + if (Col[c].is_alive()) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -891,16 +910,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [score] >= Empty) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = COLAMD_EMPTY ; + Col [c].shared3.prev = Empty ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != COLAMD_EMPTY) + if (next_col != Empty) { Col [next_col].shared3.prev = c ; } @@ -939,8 +958,8 @@ static IndexType find_ordering /* return the number of garbage collections */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ IndexType Alen, /* size of A, 2*nnz + n_col or larger */ - Colamd_Row Row [], /* of size n_row+1 */ - colamd_col Col [], /* of size n_col+1 */ + RowStructure Row [], /* of size n_row+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ IndexType n_col2, /* Remaining columns to order */ @@ -986,7 +1005,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Initialization and clear mark ==================================== */ max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; min_score = 0 ; ngarbage = 0 ; COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; @@ -1001,10 
+1020,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [min_score] >= Empty) ; /* get pivot column from head of minimum degree list */ - while (head [min_score] == COLAMD_EMPTY && min_score < n_col) + while (min_score < n_col && head [min_score] == Empty) { min_score++ ; } @@ -1012,12 +1031,12 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != COLAMD_EMPTY) + if (next_col != Empty) { - Col [next_col].shared3.prev = COLAMD_EMPTY ; + Col [next_col].shared3.prev = Empty ; } - COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; + COLAMD_ASSERT (Col[pivot_col].is_alive()) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1036,12 +1055,12 @@ static IndexType find_ordering /* return the number of garbage collections */ needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { - pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; ngarbage++ ; /* after garbage collection we will have enough */ COLAMD_ASSERT (pfree + needed_memory < Alen) ; /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } @@ -1064,9 +1083,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ; /* skip if 
row is dead */ - if (ROW_IS_DEAD (row)) + if (Row[row].is_dead()) { continue ; } @@ -1078,7 +1097,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && COL_IS_ALIVE (col)) + if (col_thickness > 0 && Col[col].is_alive()) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1105,7 +1124,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - KILL_ROW (row) ; + Row[row].kill() ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1120,7 +1139,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = COLAMD_EMPTY ; + pivot_row = Empty ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1157,7 +1176,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1172,8 +1191,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ; - if (prev_col == COLAMD_EMPTY) + COLAMD_ASSERT (cur_score >= Empty) ; + if (prev_col == Empty) { head [cur_score] = next_col ; } @@ -1181,7 +1200,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if 
(next_col != COLAMD_EMPTY) + if (next_col != Empty) { Col [next_col].shared3.prev = prev_col ; } @@ -1194,12 +1213,12 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (Row[row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row != pivot_row) ; set_difference = row_mark - tag_mark ; /* check if the row has been seen yet */ @@ -1215,7 +1234,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; - KILL_ROW (row) ; + Row[row].kill() ; } else { @@ -1237,7 +1256,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; + COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1252,12 +1271,12 @@ static IndexType find_ordering /* return the number of garbage collections */ /* get a row */ row = *cp++ ; COLAMD_ASSERT(row >= 0 && row < n_row) ; - row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (ROW_IS_MARKED_DEAD (row_mark)) + if (Row [row].is_dead()) { continue ; } + row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row_mark > tag_mark) ; /* compact the column */ *new_cp++ = row ; @@ -1278,7 +1297,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. 
Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - KILL_PRINCIPAL_COL (col) ; + Col[col].kill_principal() ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1302,7 +1321,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > Empty) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1319,7 +1338,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (COL_IS_ALIVE (col)) ; + COLAMD_ASSERT (Col[col].is_alive()) ; } } @@ -1329,11 +1348,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_DEBUG3 (("** Supercolumn detection phase. 
**\n")) ; - Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; + Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; /* === Kill the pivotal column ====================================== */ - KILL_PRINCIPAL_COL (pivot_col) ; + Col[pivot_col].kill_principal() ; /* === Clear mark =================================================== */ @@ -1341,7 +1360,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (tag_mark >= max_mark) { COLAMD_DEBUG2 (("clearing tag_mark\n")) ; - tag_mark = Eigen::internal::clear_mark (n_row, Row) ; + tag_mark = Colamd::clear_mark (n_row, Row) ; } /* === Finalize the new pivot row, and column scores ================ */ @@ -1357,7 +1376,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (COL_IS_DEAD (col)) + if (Col[col].is_dead()) { continue ; } @@ -1391,11 +1410,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ; + COLAMD_ASSERT (head [cur_score] >= Empty) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = COLAMD_EMPTY ; - if (next_col != COLAMD_EMPTY) + Col [col].shared3.prev = Empty ; + if (next_col != Empty) { Col [next_col].shared3.prev = col ; } @@ -1448,7 +1467,7 @@ static inline void order_children /* === Parameters ======================================================= */ IndexType n_col, /* number of columns of A */ - colamd_col Col [], /* of size n_col+1 */ + ColStructure Col [], /* of size n_col+1 */ IndexType p [] /* p [0 ... 
n_col-1] is the column permutation*/ ) { @@ -1464,15 +1483,15 @@ static inline void order_children for (i = 0 ; i < n_col ; i++) { /* find an un-ordered non-principal column */ - COLAMD_ASSERT (COL_IS_DEAD (i)) ; - if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY) + COLAMD_ASSERT (col_is_dead(Col, i)) ; + if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; + } while (!Col[parent].is_dead_principal()) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. collapse tree at the same time */ @@ -1482,7 +1501,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ; + COLAMD_ASSERT (Col [c].shared2.order == Empty) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1493,9 +1512,9 @@ static inline void order_children c = Col [c].shared1.parent ; /* continue until we hit an ordered column. 
There are */ - /* guarranteed not to be anymore unordered columns */ + /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == COLAMD_EMPTY) ; + } while (Col [c].shared2.order == Empty) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1547,8 +1566,8 @@ template static void detect_super_cols ( /* === Parameters ======================================================= */ - - colamd_col Col [], /* of size n_col+1 */ + + ColStructure Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ IndexType row_start, /* pointer to set of columns to check */ @@ -1578,7 +1597,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (COL_IS_DEAD (col)) + if (Col[col].is_dead()) { continue ; } @@ -1590,7 +1609,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > COLAMD_EMPTY) + if (head_column > Empty) { first_col = Col [head_column].shared3.headhash ; } @@ -1601,10 +1620,10 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != COLAMD_EMPTY ; + for (super_c = first_col ; super_c != Empty ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; + COLAMD_ASSERT (Col [super_c].is_alive()) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1614,10 +1633,10 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next) + c != Empty ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (COL_IS_ALIVE (c)) ; + COLAMD_ASSERT (Col[c].is_alive()) ; 
COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1635,10 +1654,10 @@ static void detect_super_cols for (i = 0 ; i < length ; i++) { /* the columns are "clean" (no dead rows) */ - COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; - COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; + COLAMD_ASSERT ( cp1->is_alive() ); + COLAMD_ASSERT ( cp2->is_alive() ); /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ + /* no gather scatter necessary */ if (*cp1++ != *cp2++) { break ; @@ -1658,9 +1677,9 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - KILL_NON_PRINCIPAL_COL (c) ; + Col[c].kill_non_principal() ; /* order c later, in order_children() */ - Col [c].shared2.order = COLAMD_EMPTY ; + Col [c].shared2.order = Empty ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1668,15 +1687,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > COLAMD_EMPTY) + if (head_column > Empty) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = COLAMD_EMPTY ; + Col [head_column].shared3.headhash = Empty ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = COLAMD_EMPTY ; + head [hash] = Empty ; } } } @@ -1688,7 +1707,7 @@ static void detect_super_cols /* Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. Returns + all available memory has been used while performing row merging. Returns the index of the first free position in A, after garbage collection. The time taken by this routine is linear is the size of the array A, which is itself linear in the number of nonzeros in the input matrix. 
@@ -1698,11 +1717,11 @@ template static IndexType garbage_collection /* returns the new value of pfree */ ( /* === Parameters ======================================================= */ - + IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ - Colamd_Row Row [], /* row info */ - colamd_col Col [], /* column info */ + RowStructure Row [], /* row info */ + ColStructure Col [], /* column info */ IndexType A [], /* A [0 ... Alen-1] holds the matrix */ IndexType *pfree /* &A [0] ... pfree is in use */ ) @@ -1721,7 +1740,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (COL_IS_ALIVE (c)) + if (Col[c].is_alive()) { psrc = &A [Col [c].start] ; @@ -1732,7 +1751,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (ROW_IS_ALIVE (r)) + if (Row[r].is_alive()) { *pdest++ = r ; } @@ -1745,22 +1764,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (Row[r].is_alive()) { if (Row [r].length == 0) { - /* this row is of zero length. cannot compact it, so kill it */ - COLAMD_DEBUG3 (("Defrag row kill\n")) ; - KILL_ROW (r) ; + /* this row is of zero length. 
cannot compact it, so kill it */ + COLAMD_DEBUG3 (("Defrag row kill\n")) ; + Row[r].kill() ; } else { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ONES_COMPLEMENT (r) ; + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + COLAMD_ASSERT (Row[r].is_alive()) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ones_complement(r) ; } } @@ -1776,11 +1795,11 @@ static IndexType garbage_collection /* returns the new value of pfree */ { psrc-- ; /* get the row index */ - r = ONES_COMPLEMENT (*psrc) ; + r = ones_complement(*psrc) ; COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + COLAMD_ASSERT (Row[r].is_alive()) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1789,7 +1808,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (COL_IS_ALIVE (c)) + if (Col[c].is_alive()) { *pdest++ = c ; } @@ -1821,7 +1840,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ /* === Parameters ======================================================= */ IndexType n_row, /* number of rows in A */ - Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ + RowStructure Row [] /* Row [0 ... 
n_row-1].shared2.mark is set to zero */ ) { /* === Local variables ================================================== */ @@ -1830,7 +1849,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (ROW_IS_ALIVE (r)) + if (Row[r].is_alive()) { Row [r].shared2.mark = 0 ; } @@ -1838,6 +1857,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ return (1) ; } +} // namespace Colamd -} // namespace internal +} // namespace internal #endif diff --git a/externals/eigen/Eigen/src/OrderingMethods/Ordering.h b/externals/eigen/Eigen/src/OrderingMethods/Ordering.h index 7ea9b14d..c5789701 100644 --- a/externals/eigen/Eigen/src/OrderingMethods/Ordering.h +++ b/externals/eigen/Eigen/src/OrderingMethods/Ordering.h @@ -31,15 +31,13 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) - it.valueRef() = 0.0; + it.valueRef() = typename MatrixType::Scalar(0); } symmat = C + A; } } -#ifndef EIGEN_MPL2_ONLY - /** \ingroup OrderingMethods_Module * \class AMDOrdering * @@ -81,8 +79,6 @@ class AMDOrdering } }; -#endif // EIGEN_MPL2_ONLY - /** \ingroup OrderingMethods_Module * \class NaturalOrdering * @@ -133,17 +129,17 @@ class COLAMDOrdering StorageIndex n = StorageIndex(mat.cols()); StorageIndex nnz = StorageIndex(mat.nonZeros()); // Get the recommended value of Alen to be used by colamd - StorageIndex Alen = internal::colamd_recommended(nnz, m, n); + StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); // Set the default parameters - double knobs [COLAMD_KNOBS]; - StorageIndex stats [COLAMD_STATS]; - internal::colamd_set_defaults(knobs); + double knobs [internal::Colamd::NKnobs]; + StorageIndex stats [internal::Colamd::NStats]; + internal::Colamd::set_defaults(knobs); IndexVector p(n+1), A(Alen); for(StorageIndex i=0; i <= n; i++) p(i) = mat.outerIndexPtr()[i]; for(StorageIndex 
i=0; i < nnz; i++) A(i) = mat.innerIndexPtr()[i]; // Call Colamd routine to compute the ordering - StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); + StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); EIGEN_UNUSED_VARIABLE(info); eigen_assert( info && "COLAMD failed " ); diff --git a/externals/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h b/externals/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h index d2ebfd7b..37426877 100644 --- a/externals/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/externals/eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -64,28 +64,28 @@ namespace internal typedef typename _MatrixType::StorageIndex StorageIndex; }; - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, 
int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast(vals), perm, invp, reinterpret_cast(x), nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} @@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the PaStiX reports a problem * \c InvalidInput if the input matrix is invalid * diff --git a/externals/eigen/Eigen/src/PardisoSupport/PardisoSupport.h b/externals/eigen/Eigen/src/PardisoSupport/PardisoSupport.h index 091c3970..f89b79bd 100644 --- a/externals/eigen/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/externals/eigen/Eigen/src/PardisoSupport/PardisoSupport.h @@ -123,6 +123,7 @@ class PardisoImpl : public SparseSolverBase }; PardisoImpl() + : m_analysisIsOk(false), m_factorizationIsOk(false) { eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type"); m_iparm.setZero(); @@ -140,7 +141,7 @@ class PardisoImpl : public SparseSolverBase /** \brief Reports whether previous computation was successful. 
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix appears to be negative. */ ComputationInfo info() const @@ -385,14 +386,15 @@ class PardisoLU : public PardisoImpl< PardisoLU > { protected: typedef PardisoImpl Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLU >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; + using Base::compute; using Base::solve; @@ -440,14 +442,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > { protected: typedef PardisoImpl< PardisoLLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; enum { UpLo = _UpLo }; using Base::compute; @@ -503,14 +505,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > { protected: typedef PardisoImpl< PardisoLDLT > Base; - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLDLT >; public: + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; using Base::compute; enum { UpLo = Options&(Upper|Lower) }; diff --git a/externals/eigen/Eigen/src/QR/ColPivHouseholderQR.h b/externals/eigen/Eigen/src/QR/ColPivHouseholderQR.h index 0e47c833..9b677e9b 100644 --- a/externals/eigen/Eigen/src/QR/ColPivHouseholderQR.h +++ b/externals/eigen/Eigen/src/QR/ColPivHouseholderQR.h @@ -17,6 +17,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr 
XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -46,20 +49,19 @@ template struct traits > * \sa MatrixBase::colPivHouseholderQr() */ template class ColPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; typedef typename internal::plain_row_type::type IntRowVectorType; @@ -156,6 +158,7 @@ template class ColPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -172,11 +175,8 @@ template class ColPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const @@ -402,7 +402,7 @@ template class ColPivHouseholderQR */ RealScalar maxPivot() const { return m_maxpivot; } - /** \brief Reports whether the QR factorization was succesful. + /** \brief Reports whether the QR factorization was successful. * * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. 
@@ -416,8 +416,10 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -506,8 +508,8 @@ void ColPivHouseholderQR::computeInPlace() m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k); } - RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); - RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); + RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); + RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) m_maxpivot = RealScalar(0); @@ -553,12 +555,12 @@ void ColPivHouseholderQR::computeInPlace() // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf // and used in LAPACK routines xGEQPF and xGEQP3. // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html - if (m_colNormsUpdated.coeffRef(j) != 0) { + if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) { RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); - temp = temp < 0 ? 0 : temp; - RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / - m_colNormsDirect.coeffRef(j)); + temp = temp < RealScalar(0) ? RealScalar(0) : temp; + RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / + m_colNormsDirect.coeffRef(j)); if (temp2 <= norm_downdate_threshold) { // The updated norm has become too inaccurate so re-compute the column // norm directly. 
@@ -584,8 +586,6 @@ template template void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); - const Index nonzero_pivots = nonzeroPivots(); if(nonzero_pivots == 0) @@ -596,11 +596,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs) - .setLength(nonzero_pivots) - .transpose() - ); + c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() ); m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) .template triangularView() @@ -609,6 +605,31 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i); for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } + +template +template +void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index nonzero_pivots = nonzeroPivots(); + + if(nonzero_pivots == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs); + + m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(nonzero_pivots)); + + dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots); + dst.bottomRows(rows()-nonzero_pivots).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf() ); +} #endif namespace internal { diff --git a/externals/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/externals/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b7..486d3373 100644 --- a/externals/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ 
b/externals/eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -16,6 +16,9 @@ namespace internal { template struct traits > : traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -44,19 +47,21 @@ struct traits > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template -class CompleteOrthogonalDecomposition { +template class CompleteOrthogonalDecomposition + : public SolverBase > +{ public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + + template + friend struct internal::solve_assertion; + + EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; @@ -131,9 +136,9 @@ class CompleteOrthogonalDecomposition { m_temp(matrix.cols()) { computeInPlace(); - } - + } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method computes the minimum-norm solution X to a least squares * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of * which \c *this is the complete orthogonal decomposition. 
@@ -145,11 +150,8 @@ class CompleteOrthogonalDecomposition { */ template inline const Solve solve( - const MatrixBase& b) const { - eigen_assert(m_cpqr.m_isInitialized && - "CompleteOrthogonalDecomposition is not initialized."); - return Solve(*this, b.derived()); - } + const MatrixBase& b) const; + #endif HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } @@ -158,8 +160,8 @@ class CompleteOrthogonalDecomposition { */ MatrixType matrixZ() const { MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); - applyZAdjointOnTheLeftInPlace(Z); - return Z.adjoint(); + applyZOnTheLeftInPlace(Z); + return Z; } /** \returns a reference to the matrix where the complete orthogonal @@ -275,6 +277,7 @@ class CompleteOrthogonalDecomposition { */ inline const Inverse pseudoInverse() const { + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); return Inverse(*this); } @@ -353,7 +356,7 @@ class CompleteOrthogonalDecomposition { inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * succesful. + * successful. * * \note This function always returns \c Success. 
It is provided for * compatibility @@ -367,7 +370,10 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -375,8 +381,22 @@ class CompleteOrthogonalDecomposition { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); + eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); + } + void computeInPlace(); + /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or + * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate + * is set to \c true. + */ + template + void applyZOnTheLeftInPlace(Rhs& rhs) const; + /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. 
*/ template @@ -452,7 +472,7 @@ void CompleteOrthogonalDecomposition::computeInPlace() // Apply Z(k) to the first k rows of X_k m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) .applyHouseholderOnTheRight( - m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), + m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), &m_temp(0)); } if (k != rank - 1) { @@ -464,6 +484,28 @@ void CompleteOrthogonalDecomposition::computeInPlace() } } +template +template +void CompleteOrthogonalDecomposition::applyZOnTheLeftInPlace( + Rhs& rhs) const { + const Index cols = this->cols(); + const Index nrhs = rhs.cols(); + const Index rank = this->rank(); + Matrix temp((std::max)(cols, nrhs)); + for (Index k = rank-1; k >= 0; --k) { + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + rhs.middleRows(rank - 1, cols - rank + 1) + .applyHouseholderOnTheLeft( + matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf(), zCoeffs().template conjugateIf()(k), + &temp(0)); + if (k != rank - 1) { + rhs.row(k).swap(rhs.row(rank - 1)); + } + } +} + template template void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( @@ -471,7 +513,7 @@ void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( const Index cols = this->cols(); const Index nrhs = rhs.cols(); const Index rank = this->rank(); - Matrix temp((std::max)(cols, nrhs)); + Matrix temp((std::max)(cols, nrhs)); for (Index k = 0; k < rank; ++k) { if (k != rank - 1) { rhs.row(k).swap(rhs.row(rank - 1)); @@ -491,8 +533,6 @@ template template void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { - eigen_assert(rhs.rows() == this->rows()); - const Index rank = this->rank(); if (rank == 0) { dst.setZero(); @@ -500,11 +540,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } // Compute c = Q^* * rhs - // Note that the matrix Q = H_0^* H_1^*... 
so its inverse is - // Q^* = (H_0 H_1 ...)^T typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft( - householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); + c.applyOnTheLeft(matrixQ().setLength(rank).adjoint()); // Solve T z = c(1:rank, :) dst.topRows(rank) = matrixT() @@ -523,10 +560,45 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Undo permutation to get x = P^{-1} * y. dst = colsPermutation() * dst; } + +template +template +void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = this->rank(); + + if (rank == 0) { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(colsPermutation().transpose()*rhs); + + if (rank < cols()) { + applyZOnTheLeftInPlace(c); + } + + matrixT().topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif namespace internal { +template +struct traits > > + : traits::PlainObject> +{ + enum { Flags = 0 }; +}; + template struct Assignment >, internal::assign_op::Scalar>, Dense2Dense> { @@ -534,7 +606,8 @@ struct Assignment SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { - dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows())); + typedef Matrix IdentityMatrixType; + dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols())); } }; diff --git a/externals/eigen/Eigen/src/QR/FullPivHouseholderQR.h b/externals/eigen/Eigen/src/QR/FullPivHouseholderQR.h index e489bddc..d0664a1d 100644 --- a/externals/eigen/Eigen/src/QR/FullPivHouseholderQR.h +++ b/externals/eigen/Eigen/src/QR/FullPivHouseholderQR.h @@ -18,6 +18,9 @@ namespace internal { template struct traits > : 
traits<_MatrixType> { + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -55,20 +58,19 @@ struct traits > * \sa MatrixBase::fullPivHouseholderQr() */ template class FullPivHouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef internal::FullPivHouseholderQRMatrixQReturnType MatrixQReturnType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef Matrix class FullPivHouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * \c *this is the QR decomposition. * @@ -173,11 +176,8 @@ template class FullPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** \returns Expression object representing the matrix Q */ @@ -392,22 +392,24 @@ template class FullPivHouseholderQR * diagonal coefficient of U. 
*/ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; @@ -499,15 +501,15 @@ void FullPivHouseholderQR::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; i++) { - m_rows_transpositions.coeffRef(i) = i; - m_cols_transpositions.coeffRef(i) = i; + m_rows_transpositions.coeffRef(i) = internal::convert_index(i); + m_cols_transpositions.coeffRef(i) = internal::convert_index(i); m_hCoeffs.coeffRef(i) = Scalar(0); } break; } - m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner; - m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner; + m_rows_transpositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); + m_cols_transpositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k)); ++number_of_transpositions; @@ -541,7 +543,6 @@ template template void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { - eigen_assert(rhs.rows() == rows()); const Index l_rank = rank(); // FIXME introduce nonzeroPivots() and use it here. 
and more generally, @@ -554,7 +555,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType typename RhsType::PlainObject c(rhs); - Matrix temp(rhs.cols()); + Matrix temp(rhs.cols()); for (Index k = 0; k < l_rank; ++k) { Index remainingSize = rows()-k; @@ -571,6 +572,42 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i); for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } + +template +template +void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index l_rank = rank(); + + if(l_rank == 0) + { + dst.setZero(); + return; + } + + typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs); + + m_qr.topLeftCorner(l_rank, l_rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(l_rank)); + + dst.topRows(l_rank) = c.topRows(l_rank); + dst.bottomRows(rows()-l_rank).setZero(); + + Matrix temp(dst.cols()); + const Index size = (std::min)(rows(), cols()); + for (Index k = size-1; k >= 0; --k) + { + Index remainingSize = rows()-k; + + dst.bottomRightCorner(remainingSize, dst.cols()) + .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf(), + m_hCoeffs.template conjugateIf().coeff(k), &temp.coeffRef(0)); + + dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k))); + } +} #endif namespace internal { diff --git a/externals/eigen/Eigen/src/QR/HouseholderQR.h b/externals/eigen/Eigen/src/QR/HouseholderQR.h index 3513d995..801739fb 100644 --- a/externals/eigen/Eigen/src/QR/HouseholderQR.h +++ b/externals/eigen/Eigen/src/QR/HouseholderQR.h @@ -14,6 +14,18 @@ namespace Eigen { +namespace internal { +template struct traits > + : traits<_MatrixType> +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum 
{ Flags = 0 }; +}; + +} // end namespace internal + /** \ingroup QR_Module * * @@ -42,20 +54,19 @@ namespace Eigen { * \sa MatrixBase::householderQr() */ template class HouseholderQR + : public SolverBase > { public: typedef _MatrixType MatrixType; + typedef SolverBase Base; + friend class SolverBase; + + EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR) enum { - RowsAtCompileTime = MatrixType::RowsAtCompileTime, - ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - // FIXME should be int - typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix MatrixQType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef typename internal::plain_row_type::type RowVectorType; @@ -121,6 +132,7 @@ template class HouseholderQR computeInPlace(); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -137,11 +149,8 @@ template class HouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const - { - eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); - return Solve(*this, b.derived()); - } + solve(const MatrixBase& b) const; + #endif /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations. * @@ -204,28 +213,30 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. 
*/ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -292,7 +303,7 @@ template struct householder_qr_inplace_blocked { - // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h + // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32, typename MatrixQR::Scalar* tempData = 0) { @@ -350,15 +361,10 @@ template void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); - eigen_assert(rhs.rows() == rows()); typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... 
so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence( - m_qr.leftCols(rank), - m_hCoeffs.head(rank)).transpose() - ); + c.applyOnTheLeft(householderQ().setLength(rank).adjoint() ); m_qr.topLeftCorner(rank, rank) .template triangularView() @@ -367,6 +373,25 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.topRows(rank) = c.topRows(rank); dst.bottomRows(cols()-rank).setZero(); } + +template +template +void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + const Index rank = (std::min)(rows(), cols()); + + typename RhsType::PlainObject c(rhs); + + m_qr.topLeftCorner(rank, rank) + .template triangularView() + .transpose().template conjugateIf() + .solveInPlace(c.topRows(rank)); + + dst.topRows(rank) = c.topRows(rank); + dst.bottomRows(rows()-rank).setZero(); + + dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); +} #endif /** Performs the QR factorization of the given matrix \a matrix. 
The result of diff --git a/externals/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/externals/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 953d57c9..013c7ae7 100644 --- a/externals/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/externals/eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -74,13 +74,35 @@ class SPQR : public SparseSolverBase > }; public: SPQR() - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) + : m_analysisIsOk(false), + m_factorizationIsOk(false), + m_isRUpToDate(false), + m_ordering(SPQR_ORDERING_DEFAULT), + m_allow_tol(SPQR_DEFAULT_TOL), + m_tolerance (NumTraits::epsilon()), + m_cR(0), + m_E(0), + m_H(0), + m_HPinv(0), + m_HTau(0), + m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) + : m_analysisIsOk(false), + m_factorizationIsOk(false), + m_isRUpToDate(false), + m_ordering(SPQR_ORDERING_DEFAULT), + m_allow_tol(SPQR_DEFAULT_TOL), + m_tolerance (NumTraits::epsilon()), + m_cR(0), + m_E(0), + m_H(0), + m_HPinv(0), + m_HTau(0), + m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -220,7 +242,7 @@ class SPQR : public SparseSolverBase > /** \brief Reports whether previous computation was successful. 
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the sparse QR can not be computed */ ComputationInfo info() const diff --git a/externals/eigen/Eigen/src/SVD/BDCSVD.h b/externals/eigen/Eigen/src/SVD/BDCSVD.h index 25fca6f4..17f8e443 100644 --- a/externals/eigen/Eigen/src/SVD/BDCSVD.h +++ b/externals/eigen/Eigen/src/SVD/BDCSVD.h @@ -11,7 +11,7 @@ // Copyright (C) 2013 Jean Ceccato // Copyright (C) 2013 Pierre Zoppitelli // Copyright (C) 2013 Jitse Niesen -// Copyright (C) 2014-2016 Gael Guennebaud +// Copyright (C) 2014-2017 Gael Guennebaud // // Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -22,6 +22,11 @@ // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS +#ifdef EIGEN_BDCSVD_SANITY_CHECKS +#undef eigen_internal_assert +#define eigen_internal_assert(X) assert(X); +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -34,6 +39,7 @@ namespace internal { template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -57,7 +63,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. 
* @@ -77,6 +83,7 @@ class BDCSVD : public SVDBase > typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; + typedef typename NumTraits::Literal Literal; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -104,7 +111,7 @@ class BDCSVD : public SVDBase > * The default constructor is useful in cases in which the user intends to * perform decompositions via BDCSVD::compute(const MatrixType&). */ - BDCSVD() : m_algoswap(16), m_numIters(0) + BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {} @@ -201,6 +208,7 @@ class BDCSVD : public SVDBase > using Base::m_computeThinV; using Base::m_matrixU; using Base::m_matrixV; + using Base::m_info; using Base::m_isInitialized; using Base::m_nonzeroSingularValues; @@ -211,7 +219,7 @@ class BDCSVD : public SVDBase > // Method to allocate and initialize matrix and attributes template -void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -249,17 +257,26 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign { // FIXME this line involves temporaries JacobiSVD jsvd(matrix,computationOptions); - if(computeU()) m_matrixU = jsvd.matrixU(); - if(computeV()) m_matrixV = jsvd.matrixV(); - m_singularValues = jsvd.singularValues(); - m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); m_isInitialized = true; + m_info = jsvd.info(); + if (m_info == Success || m_info == NoConvergence) { + if(computeU()) m_matrixU = jsvd.matrixU(); + if(computeV()) m_matrixV = jsvd.matrixV(); + m_singularValues = jsvd.singularValues(); + m_nonzeroSingularValues = jsvd.nonzeroSingularValues(); + } return *this; } //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows - RealScalar scale = 
matrix.cwiseAbs().maxCoeff(); - if(scale==RealScalar(0)) scale = RealScalar(1); + RealScalar scale = matrix.cwiseAbs().template maxCoeff(); + if (!(numext::isfinite)(scale)) { + m_isInitialized = true; + m_info = InvalidInput; + return *this; + } + + if(scale==Literal(0)) scale = Literal(1); MatrixX copy; if (m_isTranspose) copy = matrix.adjoint()/scale; else copy = matrix/scale; @@ -275,7 +292,11 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose(); m_computed.template bottomRows<1>().setZero(); divide(0, m_diagSize - 1, 0, 0, 0); - + if (m_info != Success && m_info != NoConvergence) { + m_isInitialized = true; + return *this; + } + //**** step 3 - Copy singular values and vectors for (int i=0; i::structured_update(Block A, co Index k1=0, k2=0; for(Index j=0; j::structured_update(Block A, co //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template -void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::divide(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { // requires rows = cols + 1; using std::pow; @@ -407,6 +428,8 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, { // FIXME this line involves temporaries JacobiSVD b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? 
ComputeFullV : 0)); + m_info = b.info(); + if (m_info != Success && m_info != NoConvergence) return; if (m_compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU(); else @@ -426,7 +449,9 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the // right submatrix before the left one. divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift); + if (m_info != Success && m_info != NoConvergence) return; divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1); + if (m_info != Success && m_info != NoConvergence) return; if (m_compU) { @@ -449,11 +474,11 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, l = m_naiveU.row(1).segment(firstCol, k); f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1); } - if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1; + if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1); if (r0::divide (Index firstCol, Index lastCol, Index firstRowW, // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template -void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); ArrayRef diag = m_workspace.head(n); - diag(0) = 0; + diag(0) = Literal(0); // Allocate space for singular values and vectors singVals.resize(n); @@ -590,7 +615,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are 
interleaved and we must ignore them at this stage. // To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==0) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -617,13 +642,11 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec std::cout << " shift: " << shifts.transpose() << "\n"; { - Index actual_n = n; - while(actual_n>1 && abs(col0(actual_n-1))= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n"; - std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n"; + assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); } #endif @@ -651,13 +674,13 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(U.allFinite()); - assert(V.allFinite()); - assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n); - assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n); assert(m_naiveU.allFinite()); assert(m_naiveV.allFinite()); assert(m_computed.allFinite()); + assert(U.allFinite()); + assert(V.allFinite()); +// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); +// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif // Because of deflation, the singular values might not be completely sorted. 
@@ -672,6 +695,15 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec if(m_compV) V.col(i).swap(V.col(i+1)); } } + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + { + bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); + if(!singular_values_sorted) + std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; + assert(singular_values_sorted); + } +#endif // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end @@ -691,11 +723,13 @@ template typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) { Index m = perm.size(); - RealScalar res = 1; + RealScalar res = Literal(1); for(Index i=0; i::computeSingVals(const ArrayRef& col0, const ArrayRef& d { using std::abs; using std::swap; + using std::sqrt; Index n = col0.size(); Index actual_n = n; - while(actual_n>1 && col0(actual_n-1)==0) --actual_n; + // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above + // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value. + while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n; for (Index k = 0; k < n; ++k) { - if (col0(k) == 0 || actual_n==1) + if (col0(k) == Literal(0) || actual_n==1) { // if col0(k) == 0, then entry is deflated, so singular value is on diagonal // if actual_n==1, then the deflated problem is already diagonalized singVals(k) = k==0 ? col0(0) : diag(k); - mus(k) = 0; + mus(k) = Literal(0); shifts(k) = k==0 ? 
col0(0) : diag(k); continue; } @@ -731,35 +768,55 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d right = (diag(actual_n-1) + col0.matrix().norm()); else { - // Skip deflated singular values + // Skip deflated singular values, + // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside. + // This should be equivalent to using perm[] Index l = k+1; - while(col0(l)==0) { ++l; eigen_internal_assert(l 0) ? left : right; + RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; // measure everything relative to shift Map diagShifted(m_workspace.data()+4*n, n); diagShifted = diag - shift; + + if(k!=actual_n-1) + { + // check that after the shift, f(mid) is still negative: + RealScalar midShifted = (right - left) / RealScalar(2); + if(shift==right) + midShifted = -midShifted; + RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + if(fMidShifted>0) + { + // fMid was erroneous, fix it: + shift = fMidShifted > Literal(0) ? left : right; + diagShifted = diag - shift; + } + } // initial guess RealScalar muPrev, muCur; @@ -785,26 +842,29 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate - bool useBisection = fPrev*fCur>0; - while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) + bool useBisection = fPrev*fCur>Literal(0); + while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) { ++m_numIters; // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples. 
- RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev); + RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev); RealScalar b = fCur - a / muCur; // And find mu such that f(mu)==0: RealScalar muZero = -a/b; RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((numext::isfinite)(fZero)); +#endif muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; - - if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true; - if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true; + if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; + if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; } @@ -817,54 +877,100 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar leftShifted, rightShifted; if (shift == left) { - leftShifted = (std::numeric_limits::min)(); + // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)), + // the factor 2 is to be more conservative + leftShifted = numext::maxi( (std::numeric_limits::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits::max)()) ); + + // check that we did it right: + eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) ); // I don't understand why the case k==0 would be special there: - // if (k == 0) rightShifted = right - left; else - rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe + // if (k == 0) rightShifted = right - left; else + rightShifted = (k==actual_n-1) ? 
right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe } else { - leftShifted = -(right - left) * RealScalar(0.6); - rightShifted = -(std::numeric_limits::min)(); + leftShifted = -(right - left) * RealScalar(0.51); + if(k+1( (std::numeric_limits::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits::max)()) ); + else + rightShifted = -(std::numeric_limits::min)(); } - + RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert(fLeft " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; " + << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n"; + std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " + << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift + << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) + << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n"; } #endif - eigen_internal_assert(fLeft * fRight < 0); - - while (rightShifted - leftShifted > 2 * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + eigen_internal_assert(fLeft * fRight < Literal(0)); + + if(fLeft Literal(2) * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) { - rightShifted = midShifted; - } - else - { - leftShifted = midShifted; - fLeft = fMid; + RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); + fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert((numext::isfinite)(fMid)); + + if (fLeft * fMid < Literal(0)) + { + rightShifted = midShifted; + } + else + { + leftShifted = midShifted; + fLeft = fMid; + } } + muCur = (leftShifted + rightShifted) / Literal(2); + } + else + { 
+ // We have a problem as shifting on the left or right give either a positive or negative value + // at the middle of [left,right]... + // Instead fo abbording or entering an infinite loop, + // let's just use the middle as the estimated zero-crossing: + muCur = (right - left) * RealScalar(0.5); + if(shift == right) + muCur = -muCur; } - - muCur = (leftShifted + rightShifted) / 2; } singVals[k] = shift + muCur; shifts[k] = shift; mus[k] = muCur; +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE + if(k+1=singVals[k-1]); + assert(singVals[k]>=diag(k)); +#endif + // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - @@ -888,37 +994,68 @@ void BDCSVD::perturbCol0 zhat.setZero(); return; } - Index last = perm(m-1); + Index lastIdx = perm(m-1); // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { - if (col0(k) == 0) // deflated - zhat(k) = 0; + if (col0(k) == Literal(0)) // deflated + zhat(k) = Literal(0); else { // see equation (3.6) RealScalar dk = diag(k); - RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk)); + RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk)); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(prod<0) { + std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n"; + std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n"; + std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n"; + } + assert(prod>=0); +#endif for(Index l = 0; l=k && (l==0 || l-1>=m)) + { + std::cout << "Error in perturbCol0\n"; + std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n"; + std::cout << " " <=0); +#endif 
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) + if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) << ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n"; #endif } } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; + std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); - zhat(k) = col0(k) > 0 ? tmp : -tmp; +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((numext::isfinite)(tmp)); +#endif + zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } } @@ -934,7 +1071,7 @@ void BDCSVD::computeSingVecs for (Index k = 0; k < n; ++k) { - if (zhat(k) == 0) + if (zhat(k) == Literal(0)) { U.col(k) = VectorType::Unit(n+1, k); if (m_compV) V.col(k) = VectorType::Unit(n, k); @@ -947,7 +1084,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - U(n,k) = 0; + U(n,k) = Literal(0); U.col(k).normalize(); if (m_compV) @@ -958,7 +1095,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - V(0,k) = -1; + V(0,k) = Literal(-1); V.col(k).normalize(); } } @@ -971,7 +1108,7 @@ void BDCSVD::computeSingVecs // i >= 1, di almost null and zi non null. 
// We use a rotation to zero out zi applied to the left of M template -void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index size) +void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) { using std::abs; using std::sqrt; @@ -979,15 +1116,15 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index Index start = firstCol + shift; RealScalar c = m_computed(start, start); RealScalar s = m_computed(start+i, start); - RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s)); - if (r == 0) + RealScalar r = numext::hypot(c,s); + if (r == Literal(0)) { - m_computed(start+i, start+i) = 0; + m_computed(start+i, start+i) = Literal(0); return; } m_computed(start,start) = r; - m_computed(start+i, start) = 0; - m_computed(start+i, start+i) = 0; + m_computed(start+i, start) = Literal(0); + m_computed(start+i, start+i) = Literal(0); JacobiRotation J(c/r,-s/r); if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J); @@ -1000,7 +1137,7 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template -void BDCSVD::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) +void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1020,16 +1157,16 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi << m_computed(firstColm + i+1, firstColm+i+1) << " " << m_computed(firstColm + i+2, firstColm+i+2) << "\n"; #endif - if (r==0) + if (r==Literal(0)) { m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j); return; } c/=r; s/=r; - m_computed(firstColm + i, firstColm) = r; + m_computed(firstColm + i, firstColm) = r; 
m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); - m_computed(firstColm + j, firstColm) = 0; + m_computed(firstColm + j, firstColm) = Literal(0); JacobiRotation J(c,-s); if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J); @@ -1040,7 +1177,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template -void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { using std::sqrt; using std::abs; @@ -1053,7 +1190,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits::epsilon() * maxDiag); - RealScalar epsilon_coarse = 8 * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); + RealScalar epsilon_coarse = Literal(8) * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert(m_naiveU.allFinite()); @@ -1081,7 +1218,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << " (diag(" << i << ")=" << diag(i) << ")\n"; #endif - col0(i) = 0; + col0(i) = Literal(0); } //condition 4.3 @@ -1101,6 +1238,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; + std::cout << " : " << 
col0.transpose() << "\n\n"; #endif { // Check for total deflation @@ -1191,7 +1329,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*diag(i) << "\n"; + std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*/*diag(i)*/maxDiag << "\n"; #endif eigen_internal_assert(abs(diag(i) - diag(i-1))::deflation(Index firstCol, Index lastCol, Index k, Index #endif }//end deflation -#ifndef __CUDACC__ /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm @@ -1223,7 +1360,6 @@ MatrixBase::bdcSvd(unsigned int computationOptions) const { return BDCSVD(*this, computationOptions); } -#endif } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/SVD/JacobiSVD.h b/externals/eigen/Eigen/src/SVD/JacobiSVD.h index 43488b1e..9d95acdf 100644 --- a/externals/eigen/Eigen/src/SVD/JacobiSVD.h +++ b/externals/eigen/Eigen/src/SVD/JacobiSVD.h @@ -112,12 +112,12 @@ class qr_preconditioner_impl - TransposeTypeWithSameStorageOrder; + + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -202,13 +202,12 @@ class qr_preconditioner_impl - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -303,8 +302,9 @@ class qr_preconditioner_impl - TransposeTypeWithSameStorageOrder; + typedef typename 
internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -425,6 +425,7 @@ struct svd_precondition_2x2_block_to_be_real template struct traits > + : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -584,6 +585,7 @@ template class JacobiSVD using Base::m_matrixU; using Base::m_matrixV; using Base::m_singularValues; + using Base::m_info; using Base::m_isInitialized; using Base::m_isAllocated; using Base::m_usePrescribedThreshold; @@ -610,7 +612,7 @@ template class JacobiSVD }; template -void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); @@ -624,6 +626,7 @@ void JacobiSVD::allocate(Index rows, Index cols, u m_rows = rows; m_cols = cols; + m_info = Success; m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; @@ -673,7 +676,12 @@ JacobiSVD::compute(const MatrixType& matrix, unsig const RealScalar considerAsZero = (std::numeric_limits::min)(); // Scaling factor to reduce over/under-flows - RealScalar scale = matrix.cwiseAbs().maxCoeff(); + RealScalar scale = matrix.cwiseAbs().template maxCoeff(); + if (!(numext::isfinite)(scale)) { + m_isInitialized = true; + m_info = InvalidInput; + return *this; + } if(scale==RealScalar(0)) scale = RealScalar(1); /*** step 1. 
The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */ diff --git a/externals/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/externals/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h index 50272154..ff0516f6 100644 --- a/externals/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h +++ b/externals/eigen/Eigen/src/SVD/JacobiSVD_LAPACKE.h @@ -61,9 +61,10 @@ JacobiSVD, ColPiv u = (LAPACKE_TYPE*)m_matrixU.data(); \ } else { ldu=1; u=&dummy; }\ MatrixType localV; \ - ldvt = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? internal::convert_index(m_diagSize) : 1; \ + lapack_int vt_rows = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? internal::convert_index(m_diagSize) : 1; \ if (computeV()) { \ - localV.resize(ldvt, m_cols); \ + localV.resize(vt_rows, m_cols); \ + ldvt = internal::convert_index(localV.outerStride()); \ vt = (LAPACKE_TYPE*)localV.data(); \ } else { ldvt=1; vt=&dummy; }\ Matrix superb; superb.resize(m_diagSize, 1); \ diff --git a/externals/eigen/Eigen/src/SVD/SVDBase.h b/externals/eigen/Eigen/src/SVD/SVDBase.h index cc90a3b7..bc7ab88b 100644 --- a/externals/eigen/Eigen/src/SVD/SVDBase.h +++ b/externals/eigen/Eigen/src/SVD/SVDBase.h @@ -17,6 +17,18 @@ #define EIGEN_SVDBASE_H namespace Eigen { + +namespace internal { +template struct traits > + : traits +{ + typedef MatrixXpr XprKind; + typedef SolverStorage StorageKind; + typedef int StorageIndex; + enum { Flags = 0 }; +}; +} + /** \ingroup SVD_Module * * @@ -39,20 +51,26 @@ namespace Eigen { * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving. 
+ * + * The status of the computation can be retrived using the \a info() method. Unless \a info() returns \a Success, the results should be not + * considered well defined. * - * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to + * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will return \a InvalidInput, but the computation is guaranteed to * terminate in finite (and reasonable) time. * \sa class BDCSVD, class JacobiSVD */ -template -class SVDBase +template class SVDBase + : public SolverBase > { +public: + + template + friend struct internal::solve_assertion; -public: typedef typename internal::traits::MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; + typedef typename Eigen::internal::traits::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -82,7 +100,7 @@ class SVDBase */ const MatrixUType& matrixU() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?"); return m_matrixU; } @@ -98,7 +116,7 @@ class SVDBase */ const MatrixVType& matrixV() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); eigen_assert(computeV() && "This SVD decomposition didn't compute V. 
Did you ask for it?"); return m_matrixV; } @@ -110,14 +128,14 @@ class SVDBase */ const SingularValuesType& singularValues() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); return m_singularValues; } /** \returns the number of singular values that are not exactly 0 */ Index nonzeroSingularValues() const { - eigen_assert(m_isInitialized && "SVD is not initialized."); + _check_compute_assertions(); return m_nonzeroSingularValues; } @@ -130,7 +148,7 @@ class SVDBase inline Index rank() const { using std::abs; - eigen_assert(m_isInitialized && "JacobiSVD is not initialized."); + _check_compute_assertions(); if(m_singularValues.size()==0) return 0; RealScalar premultiplied_threshold = numext::maxi(m_singularValues.coeff(0) * threshold(), (std::numeric_limits::min)()); Index i = m_nonzeroSingularValues-1; @@ -180,8 +198,10 @@ class SVDBase RealScalar threshold() const { eigen_assert(m_isInitialized || m_usePrescribedThreshold); + // this temporary is needed to workaround a MSVC issue + Index diagSize = (std::max)(1,m_diagSize); return m_usePrescribedThreshold ? m_prescribedThreshold - : (std::max)(1,m_diagSize)*NumTraits::epsilon(); + : RealScalar(diagSize)*NumTraits::epsilon(); } /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */ @@ -192,6 +212,7 @@ class SVDBase inline Index rows() const { return m_rows; } inline Index cols() const { return m_cols; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A. * * \param b the right-hand-side of the equation to solve. @@ -203,32 +224,55 @@ class SVDBase */ template inline const Solve - solve(const MatrixBase& b) const + solve(const MatrixBase& b) const; + #endif + + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful. 
+ */ + EIGEN_DEVICE_FUNC + ComputationInfo info() const { eigen_assert(m_isInitialized && "SVD is not initialized."); - eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice)."); - return Solve(derived(), b.derived()); + return m_info; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; + + template + void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + + void _check_compute_assertions() const { + eigen_assert(m_isInitialized && "SVD is not initialized."); + } + + template + void _check_solve_assertion(const Rhs& b) const { + EIGEN_ONLY_USED_FOR_DEBUG(b); + _check_compute_assertions(); + eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); + eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); + } + // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; MatrixUType m_matrixU; MatrixVType m_matrixV; SingularValuesType m_singularValues; + ComputationInfo m_info; bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold; bool m_computeFullU, m_computeThinU; bool m_computeFullV, m_computeThinV; @@ -241,9 +285,14 @@ class SVDBase * Default constructor of SVDBase */ SVDBase() - : m_isInitialized(false), + : m_info(Success), + m_isInitialized(false), m_isAllocated(false), m_usePrescribedThreshold(false), + m_computeFullU(false), + m_computeThinU(false), + m_computeFullV(false), + m_computeThinV(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) { @@ -258,17 +307,30 @@ template template void SVDBase::_solve_impl(const RhsType &rhs, DstType &dst) const { - 
eigen_assert(rhs.rows() == rows()); - // A = U S V^* // So A^{-1} = V S^{-1} U^* - Matrix tmp; + Matrix tmp; Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; dst = m_matrixV.leftCols(l_rank) * tmp; } + +template +template +void SVDBase::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const +{ + // A = U S V^* + // So A^{-*} = U S^{-1} V^* + // And A^{-T} = U_conj S^{-1} V^T + Matrix tmp; + Index l_rank = rank(); + + tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf() * rhs; + tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; + dst = m_matrixU.template conjugateIf().leftCols(l_rank) * tmp; +} #endif template @@ -286,6 +348,7 @@ bool SVDBase::allocate(Index rows, Index cols, unsigned int computat m_rows = rows; m_cols = cols; + m_info = Success; m_isInitialized = false; m_isAllocated = true; m_computationOptions = computationOptions; diff --git a/externals/eigen/Eigen/src/SVD/UpperBidiagonalization.h b/externals/eigen/Eigen/src/SVD/UpperBidiagonalization.h index 0b146089..997defc4 100644 --- a/externals/eigen/Eigen/src/SVD/UpperBidiagonalization.h +++ b/externals/eigen/Eigen/src/SVD/UpperBidiagonalization.h @@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat, .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]); // apply householder transform to remaining part of mat on the left mat.bottomRightCorner(remainingRows-1, remainingCols) - .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData); + .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData); } } @@ -159,6 +159,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, traits::Flags & RowMajorBit> > Y) { typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename 
NumTraits::Literal Literal; enum { StorageOrder = traits::Flags & RowMajorBit }; typedef InnerStride ColInnerStride; typedef InnerStride RowInnerStride; @@ -200,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the begining of column k of Y as a temporary vector + // let's use the beginning of column k of Y as a temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -229,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the begining of column k of X as a temporary vectors + // let's use the beginning of column k of X as a temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); @@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, SubMatType A10( A.block(bs,0, brows-bs,bs) ); SubMatType A01( A.block(0,bs, bs,bcols-bs) ); Scalar tmp = A01(bs-1,0); - A01(bs-1,0) = 1; + A01(bs-1,0) = Literal(1); A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint(); A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01; A01(bs-1,0) = tmp; diff --git a/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h b/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h index 2907f652..9f93e325 100644 --- a/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -80,11 +80,19 @@ class SimplicialCholeskyBase : public SparseSolverBase /** Default constructor */ SimplicialCholeskyBase() - : m_info(Success), m_shiftOffset(0), m_shiftScale(1) + : m_info(Success), + m_factorizationIsOk(false), + m_analysisIsOk(false), + m_shiftOffset(0), + m_shiftScale(1) {} explicit 
SimplicialCholeskyBase(const MatrixType& matrix) - : m_info(Success), m_shiftOffset(0), m_shiftScale(1) + : m_info(Success), + m_factorizationIsOk(false), + m_analysisIsOk(false), + m_shiftOffset(0), + m_shiftScale(1) { derived().compute(matrix); } @@ -101,7 +109,7 @@ class SimplicialCholeskyBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -210,7 +218,7 @@ class SimplicialCholeskyBase : public SparseSolverBase CholMatrixType tmp(size,size); ConstCholMatrixPtr pmat; - if(m_P.size()==0 && (UpLo&Upper)==Upper) + if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper) { // If there is no ordering, try to directly use the input matrix without any copy internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); @@ -279,8 +287,8 @@ template struct traits CholMatrixType; typedef TriangularView MatrixL; typedef TriangularView MatrixU; - static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); } - static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); } + static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); } + static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); } }; template struct traits > @@ -293,8 +301,8 @@ template struct traits CholMatrixType; typedef TriangularView MatrixL; typedef TriangularView MatrixU; - static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); } - static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); } + static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); } + static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); } }; template struct traits > @@ -608,7 +616,7 @@ template } if(Base::m_diag.size()>0) - dest = 
Base::m_diag.asDiagonal().inverse() * dest; + dest = Base::m_diag.real().asDiagonal().inverse() * dest; if (Base::m_matrix.nonZeros()>0) // otherwise I==I { diff --git a/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 31e06995..72e1740c 100644 --- a/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/externals/eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -2,46 +2,21 @@ // for linear algebra. // // Copyright (C) 2008-2012 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* - -NOTE: thes functions vave been adapted from the LDL library: +NOTE: these functions have been adapted from the LDL library: LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved. -LDL License: - - Your use or distribution of LDL or any modified version of - LDL implies that you agree to this License. - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 - USA - - Permission is hereby granted to use or copy this program under the - terms of the GNU LGPL, provided that the Copyright, this License, - and the Availability of the original version is retained on all copies. - User documentation of any code that uses this code or any modified - version of this code must cite the Copyright, this License, the - Availability note, and "Used by permission." Permission to modify - the code and to distribute modified code is granted, provided the - Copyright, this License, and the Availability note are retained, - and a notice that the code was modified is included. +The author of LDL, Timothy A. Davis., has executed a license with Google LLC +to permit distribution of this code and derivative works as part of Eigen under +the Mozilla Public License v. 2.0, as stated at the top of this file. 
*/ -#include "../Core/util/NonMPL2.h" - #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H @@ -122,7 +97,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& for(StorageIndex k = 0; k < size; ++k) { // compute nonzero pattern of kth row of L, in topological order - y[k] = 0.0; // Y(0:k) is now all zero + y[k] = Scalar(0); // Y(0:k) is now all zero StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L @@ -146,17 +121,17 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& /* compute numerical values kth row of L (a sparse triangular solve) */ RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset; // get D(k,k), apply the shift function, and clear Y(k) - y[k] = 0.0; + y[k] = Scalar(0); for(; top < size; ++top) { Index i = pattern[top]; /* pattern[top:n-1] is pattern of L(:,k) */ Scalar yi = y[i]; /* get and clear Y(i) */ - y[i] = 0.0; + y[i] = Scalar(0); /* the nonzero entry L(k,i) */ Scalar l_ki; if(DoLDLT) - l_ki = yi / m_diag[i]; + l_ki = yi / numext::real(m_diag[i]); else yi = l_ki = yi / Lx[Lp[i]]; diff --git a/externals/eigen/Eigen/src/SparseCore/AmbiVector.h b/externals/eigen/Eigen/src/SparseCore/AmbiVector.h index 8a5cc91f..2cb7747c 100644 --- a/externals/eigen/Eigen/src/SparseCore/AmbiVector.h +++ b/externals/eigen/Eigen/src/SparseCore/AmbiVector.h @@ -28,7 +28,7 @@ class AmbiVector typedef typename NumTraits::Real RealScalar; explicit AmbiVector(Index size) - : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) + : m_buffer(0), m_zero(0), m_size(0), m_end(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) { resize(size); } @@ -94,7 +94,7 @@ class AmbiVector Index allocSize = m_allocatedElements * sizeof(ListEl); allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; - 
memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); + std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; m_buffer = newBuffer; } @@ -147,7 +147,8 @@ template void AmbiVector<_Scalar,_StorageIndex>::init(int mode) { m_mode = mode; - if (m_mode==IsSparse) + // This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings + // if (m_mode==IsSparse) { m_llSize = 0; m_llStart = -1; diff --git a/externals/eigen/Eigen/src/SparseCore/CompressedStorage.h b/externals/eigen/Eigen/src/SparseCore/CompressedStorage.h index d89fa0da..acd986fa 100644 --- a/externals/eigen/Eigen/src/SparseCore/CompressedStorage.h +++ b/externals/eigen/Eigen/src/SparseCore/CompressedStorage.h @@ -207,6 +207,22 @@ class CompressedStorage return m_values[id]; } + void moveChunk(Index from, Index to, Index chunkSize) + { + eigen_internal_assert(to+chunkSize <= m_size); + if(to>from && from+chunkSize>to) + { + // move backward + internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + else + { + internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to); + internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to); + } + } + void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { Index k = 0; diff --git a/externals/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/externals/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 492eb0a2..94865025 100644 --- a/externals/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/externals/eigen/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -10,29 +10,31 @@ #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H -namespace Eigen { +namespace Eigen { namespace internal { template static void 
conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; // make sure to call innerSize/outerSize since we fake the storage order. Index rows = lhs.innerSize(); Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); - + ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0); + ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0); ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0); - + std::memset(mask,0,sizeof(bool)*rows); evaluator lhsEval(lhs); evaluator rhsEval(rhs); - + // estimate the number of non zero entries // given a rhs column containing Y non zeros, we assume that the respective Y columns // of the lhs differs in average of one non zeros, thus the number of non zeros for @@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r Index nnz = 0; for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); if(!mask[i]) { mask[i] = true; @@ -139,7 +141,7 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; typedef SparseMatrix ColMajorMatrixAux; typedef typename sparse_eval::type ColMajorMatrix; - + // If the result is tall and thin (in the extreme case a column vector) // then it is faster to sort the coefficients inplace instead of transposing twice. // FIXME, the following heuristic is probably not very good. 
@@ -153,7 +155,7 @@ struct conservative_sparse_sparse_product_selector(lhs, rhs, resCol, false); RowMajorMatrix resRow(resCol); res = resRow.markAsRValue(); @@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix rhsRow = rhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); - res = resRow; + typedef SparseMatrix RowMajorRhs; + typedef SparseMatrix RowMajorRes; + RowMajorRhs rhsRow = rhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); + res = resRow; } }; @@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix lhsRow = lhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); + typedef SparseMatrix RowMajorLhs; + typedef SparseMatrix RowMajorRes; + RowMajorLhs lhsRow = lhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); res = resRow; } }; @@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol = lhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); + typedef SparseMatrix ColMajorLhs; + typedef SparseMatrix ColMajorRes; + ColMajorLhs lhsCol = lhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); res = resCol; } }; @@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol = rhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); + typedef SparseMatrix ColMajorRhs; + typedef SparseMatrix ColMajorRes; + ColMajorRhs rhsCol = rhs; + ColMajorRes resCol(lhs.rows(), 
rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); res = resCol; } }; @@ -263,7 +269,8 @@ namespace internal { template static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); @@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, { for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); res.coeffRef(i,j) += x * y; } } @@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol(lhs); - internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); + typedef SparseMatrix ColMajorLhs; + ColMajorLhs lhsCol(lhs); + internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); } }; @@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol(rhs); - internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); + typedef SparseMatrix ColMajorRhs; + ColMajorRhs rhsCol(rhs); + internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); } }; diff --git a/externals/eigen/Eigen/src/SparseCore/SparseAssign.h b/externals/eigen/Eigen/src/SparseCore/SparseAssign.h index 18352a84..905485c8 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseAssign.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseAssign.h @@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) // eval without temporary dst.resize(src.rows(), src.cols()); 
dst.setZero(); - dst.reserve((std::max)(src.rows(),src.cols())*2); + dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); for (Index j=0; j }; // Generic Sparse to Dense assignment -template< typename DstXprType, typename SrcXprType, typename Functor> -struct Assignment +template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> +struct Assignment { static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { @@ -153,6 +153,73 @@ struct Assignment } }; +// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense +template +struct assignment_from_dense_op_sparse +{ + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) + { + #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN + EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN + #endif + + call_assignment_no_alias(dst, src.lhs(), Func1()); + call_assignment_no_alias(dst, src.rhs(), Func2()); + } + + // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse; + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::Shape,DenseShape>::value>::type + run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, + const internal::assign_op& /*func*/) + { + #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN + EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN + #endif + + // Apply the dense matrix first, then the sparse one. 
+ call_assignment_no_alias(dst, src.rhs(), Func1()); + call_assignment_no_alias(dst, src.lhs(), Func2()); + } + + // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse; + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::Shape,DenseShape>::value>::type + run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, + const internal::assign_op& /*func*/) + { + #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN + EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN + #endif + + // Apply the dense matrix first, then the sparse one. + call_assignment_no_alias(dst, -src.rhs(), Func1()); + call_assignment_no_alias(dst, src.lhs(), add_assign_op()); + } +}; + +#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP,BINOP,ASSIGN_OP2) \ + template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \ + struct Assignment, const Lhs, const Rhs>, internal::ASSIGN_OP, \ + Sparse2Dense, \ + typename internal::enable_if< internal::is_same::Shape,DenseShape>::value \ + || internal::is_same::Shape,DenseShape>::value>::type> \ + : assignment_from_dense_op_sparse, internal::ASSIGN_OP2 > \ + {} + +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_sum_op,add_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_sum_op,add_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_sum_op,sub_assign_op); + +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_difference_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_difference_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_difference_op,add_assign_op); + + // Specialization for "dst = dec.solve(rhs)" // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error template @@ -179,35 +246,22 @@ struct Assignment { typedef typename DstXprType::StorageIndex StorageIndex; typedef typename DstXprType::Scalar Scalar; - typedef Array 
ArrayXI; - typedef Array ArrayXS; - template - static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { - Index dstRows = src.rows(); - Index dstCols = src.cols(); - if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) - dst.resize(dstRows, dstCols); - Index size = src.diagonal().size(); - dst.makeCompressed(); - dst.resizeNonZeros(size); - Map(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); - Map(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); - Map(dst.valuePtr(), size) = src.diagonal(); - } + template + static void run(SparseMatrix &dst, const SrcXprType &src, const AssignFunc &func) + { dst.assignDiagonal(src.diagonal(), func); } template static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { - dst.diagonal() = src.diagonal(); - } + { dst.derived().diagonal() = src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) - { dst.diagonal() += src.diagonal(); } + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.derived().diagonal() += src.diagonal(); } - static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) - { dst.diagonal() -= src.diagonal(); } + template + static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.derived().diagonal() -= src.diagonal(); } }; } // end namespace internal diff --git a/externals/eigen/Eigen/src/SparseCore/SparseBlock.h b/externals/eigen/Eigen/src/SparseCore/SparseBlock.h index 511e92b2..5b4f6cc9 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseBlock.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseBlock.h @@ -164,7 +164,7 @@ class sparse_matrix_block_impl } else { - if(m_matrix.isCompressed()) + if(m_matrix.isCompressed() && nnz!=block_size) { // no need to realloc, simply copy the 
tail at its respective position and insert tmp matrix.data().resize(start + nnz + tail_size); @@ -326,46 +326,6 @@ class BlockImpl,BlockRows,B //---------- -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). - */ -template -typename SparseMatrixBase::InnerVectorReturnType SparseMatrixBase::innerVector(Index outer) -{ return InnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorReturnType SparseMatrixBase::innerVector(Index outer) const -{ return ConstInnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). - */ -template -typename SparseMatrixBase::InnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - /** Generic implementation of sparse Block expression. * Real-only. 
*/ @@ -486,9 +446,13 @@ struct unary_evaluator, IteratorBa {} inline Index nonZerosEstimate() const { - Index nnz = m_block.nonZeros(); - if(nnz<0) - return m_argImpl.nonZerosEstimate() * m_block.size() / m_block.nestedExpression().size(); + const Index nnz = m_block.nonZeros(); + if(nnz < 0) { + // Scale the non-zero estimate for the underlying expression linearly with block size. + // Return zero if the underlying block is empty. + const Index nested_sz = m_block.nestedExpression().size(); + return nested_sz == 0 ? 0 : m_argImpl.nonZerosEstimate() * m_block.size() / nested_sz; + } return nnz; } @@ -503,22 +467,25 @@ template class unary_evaluator, IteratorBased>::InnerVectorInnerIterator : public EvalIterator { - enum { IsRowMajor = unary_evaluator::IsRowMajor }; + // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator + // because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786) + // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor + enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; const XprType& m_block; Index m_end; public: EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer) - : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())), + : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())), m_block(aEval.m_block), - m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()) + m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()) { - while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) ) + while( (EvalIterator::operator bool()) && (EvalIterator::index() < (XprIsRowMajor ? 
m_block.startCol() : m_block.startRow())) ) EvalIterator::operator++(); } - inline StorageIndex index() const { return EvalIterator::index() - convert_index(IsRowMajor ? m_block.startCol() : m_block.startRow()); } - inline Index outer() const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); } + inline StorageIndex index() const { return EvalIterator::index() - convert_index(XprIsRowMajor ? m_block.startCol() : m_block.startRow()); } + inline Index outer() const { return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol()); } inline Index row() const { return EvalIterator::row() - m_block.startRow(); } inline Index col() const { return EvalIterator::col() - m_block.startCol(); } @@ -528,7 +495,8 @@ class unary_evaluator, IteratorBas template class unary_evaluator, IteratorBased>::OuterVectorInnerIterator { - enum { IsRowMajor = unary_evaluator::IsRowMajor }; + // NOTE see above + enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; const unary_evaluator& m_eval; Index m_outerPos; const Index m_innerIndex; @@ -538,9 +506,9 @@ class unary_evaluator, IteratorBas EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer) : m_eval(aEval), - m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ), - m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()), - m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()), + m_outerPos( (XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ), + m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()), + m_end(XprIsRowMajor ? 
aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()), m_it(m_eval.m_argImpl, m_outerPos) { EIGEN_UNUSED_VARIABLE(outer); @@ -551,10 +519,10 @@ class unary_evaluator, IteratorBas ++(*this); } - inline StorageIndex index() const { return convert_index(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); } + inline StorageIndex index() const { return convert_index(m_outerPos - (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); } inline Index outer() const { return 0; } - inline Index row() const { return IsRowMajor ? 0 : index(); } - inline Index col() const { return IsRowMajor ? index() : 0; } + inline Index row() const { return XprIsRowMajor ? 0 : index(); } + inline Index col() const { return XprIsRowMajor ? index() : 0; } inline Scalar value() const { return m_it.value(); } inline Scalar& valueRef() { return m_it.valueRef(); } diff --git a/externals/eigen/Eigen/src/SparseCore/SparseCompressedBase.h b/externals/eigen/Eigen/src/SparseCore/SparseCompressedBase.h index 5ccb4665..6a2c7a8c 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseCompressedBase.h @@ -128,6 +128,28 @@ class SparseCompressedBase protected: /** Default constructor. Do nothing. */ SparseCompressedBase() {} + + /** \internal return the index of the coeff at (row,col) or just before if it does not exist. + * This is an analogue of std::lower_bound. + */ + internal::LowerBoundIndex lower_bound(Index row, Index col) const + { + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = this->outerIndexPtr()[outer]; + Index end = this->isCompressed() ? 
this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer]; + eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + internal::LowerBoundIndex p; + p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr(); + p.found = (p.valueinnerIndexPtr()[p.value]==inner); + return p; + } + + friend struct internal::evaluator >; + private: template explicit SparseCompressedBase(const SparseCompressedBase&); }; @@ -185,6 +207,14 @@ class SparseCompressedBase::InnerIterator } inline InnerIterator& operator++() { m_id++; return *this; } + inline InnerIterator& operator+=(Index i) { m_id += i ; return *this; } + + inline InnerIterator operator+(Index i) + { + InnerIterator result = *this; + result += i; + return result; + } inline const Scalar& value() const { return m_values[m_id]; } inline Scalar& valueRef() { return const_cast(m_values[m_id]); } @@ -245,6 +275,14 @@ class SparseCompressedBase::ReverseInnerIterator } inline ReverseInnerIterator& operator--() { --m_id; return *this; } + inline ReverseInnerIterator& operator-=(Index i) { m_id -= i; return *this; } + + inline ReverseInnerIterator operator-(Index i) + { + ReverseInnerIterator result = *this; + result -= i; + return result; + } inline const Scalar& value() const { return m_values[m_id-1]; } inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } @@ -317,17 +355,8 @@ struct evaluator > Index find(Index row, Index col) const { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = Derived::IsRowMajor ? row : col; - const Index inner = Derived::IsRowMajor ? col : row; - - Index start = m_matrix->outerIndexPtr()[outer]; - Index end = m_matrix->isCompressed() ? 
m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr(); - - return ((pinnerIndexPtr()[p]==inner)) ? p : Dynamic; + internal::LowerBoundIndex p = m_matrix->lower_bound(row,col); + return p.found ? p.value : Dynamic; } const Derived *m_matrix; diff --git a/externals/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/externals/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index e315e355..9b0d3f98 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -101,7 +101,7 @@ struct binary_evaluator, IteratorBased, Iterat } else { - m_value = 0; // this is to avoid a compilation warning + m_value = Scalar(0); // this is to avoid a compilation warning m_id = -1; } return *this; @@ -126,7 +126,7 @@ struct binary_evaluator, IteratorBased, Iterat enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -211,9 +211,8 @@ struct binary_evaluator, IndexBased, IteratorB enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit binary_evaluator(const XprType& xpr) @@ -299,9 +298,8 @@ struct binary_evaluator, IteratorBased, IndexB enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + 
functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit binary_evaluator(const XprType& xpr) @@ -459,7 +457,7 @@ struct sparse_conjunction_evaluator enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -532,9 +530,8 @@ struct sparse_conjunction_evaluator enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit sparse_conjunction_evaluator(const XprType& xpr) @@ -607,9 +604,8 @@ struct sparse_conjunction_evaluator enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - // Expose storage order of the sparse expression - Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit) + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), + Flags = XprType::Flags }; explicit sparse_conjunction_evaluator(const XprType& xpr) diff --git a/externals/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/externals/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index ea797379..32dac0f7 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -24,7 +24,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = 
evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -49,6 +49,7 @@ template class unary_evaluator, IteratorBased>::InnerIterator : public unary_evaluator, IteratorBased>::EvalIterator { + protected: typedef typename XprType::Scalar Scalar; typedef typename unary_evaluator, IteratorBased>::EvalIterator Base; public: @@ -78,7 +79,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -99,6 +100,7 @@ template class unary_evaluator, IteratorBased>::InnerIterator : public unary_evaluator, IteratorBased>::EvalIterator { + protected: typedef typename XprType::Scalar Scalar; typedef typename unary_evaluator, IteratorBased>::EvalIterator Base; public: diff --git a/externals/eigen/Eigen/src/SparseCore/SparseDenseProduct.h b/externals/eigen/Eigen/src/SparseCore/SparseDenseProduct.h index 0547db59..f005a18a 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseDenseProduct.h @@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) { - evaluator lhsEval(lhs); + LhsEval lhsEval(lhs); for(Index c=0; c::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static 
void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - evaluator lhsEval(lhs); - for(Index j=0; j1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) { - typename Res::RowXpr res_j(res.row(j)); - for(LhsInnerIterator it(lhsEval,j); it ;++it) - res_j += (alpha*it.value()) * rhs.row(it.index()); + #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) + for(Index i=0; i diff --git a/externals/eigen/Eigen/src/SparseCore/SparseMatrix.h b/externals/eigen/Eigen/src/SparseCore/SparseMatrix.h index 323c2323..616b4a0c 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseMatrix.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseMatrix.h @@ -21,7 +21,7 @@ namespace Eigen { * This class implements a more versatile variants of the common \em compressed row/column storage format. * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index. * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra - * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero + * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero * can be done with limited memory reallocation and copies. 
* * A call to the function makeCompressed() turns the matrix into the standard \em compressed format @@ -99,6 +99,8 @@ class SparseMatrix typedef SparseCompressedBase Base; using Base::convert_index; friend class SparseVector<_Scalar,0,_StorageIndex>; + template + friend struct internal::Assignment; public: using Base::isCompressed; using Base::nonZeros; @@ -327,7 +329,8 @@ class SparseMatrix m_outerIndex[j] = newOuterIndex[j]; m_innerNonZeros[j] = innerNNZ; } - m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; + if(m_outerSize>0) + m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; m_data.resize(m_outerIndex[m_outerSize]); } @@ -502,8 +505,8 @@ class SparseMatrix m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; } } - - /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */ + + /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { prune(default_prunning_func(reference,epsilon)); @@ -576,10 +579,12 @@ class SparseMatrix else if (innerChange < 0) { // Inner size decreased: allocate a new m_innerNonZeros - m_innerNonZeros = static_cast(std::malloc((m_outerSize+outerChange+1) * sizeof(StorageIndex))); + m_innerNonZeros = static_cast(std::malloc((m_outerSize + outerChange) * sizeof(StorageIndex))); if (!m_innerNonZeros) internal::throw_std_bad_alloc(); - for(Index i = 0; i < m_outerSize; i++) + for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++) m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; + for(Index i = m_outerSize; i < m_outerSize + outerChange; i++) + m_innerNonZeros[i] = 0; } // Change the m_innerNonZeros in case of a decrease of inner size @@ -604,9 +609,9 @@ class 
SparseMatrix m_outerIndex = newOuterIndex; if (outerChange > 0) { - StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; + StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; for(Index i=m_outerSize; i inline SparseMatrix& operator=(const EigenBase& other) { return Base::operator=(other.derived()); } + + template + inline SparseMatrix& operator=(const Product& other); #endif // EIGEN_PARSED_BY_DOXYGEN template @@ -893,7 +901,114 @@ class SparseMatrix Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++; m_data.index(p) = convert_index(inner); - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); + } +protected: + struct IndexPosPair { + IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {} + Index i; + Index p; + }; + + /** \internal assign \a diagXpr to the diagonal of \c *this + * There are different strategies: + * 1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression. + * 2 - otherwise, for each diagonal coeff, + * 2.a - if it already exists, then we update it, + * 2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion. + * 2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector. + * 3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements. + * + * TODO: some piece of code could be isolated and reused for a general in-place update strategy. + * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once), + * then it *might* be better to disable case 2.b since they will have to be copied anyway. 
+ */ + template + void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) + { + Index n = diagXpr.size(); + + const bool overwrite = internal::is_same >::value; + if(overwrite) + { + if((this->rows()!=n) || (this->cols()!=n)) + this->resize(n, n); + } + + if(m_data.size()==0 || overwrite) + { + typedef Array ArrayXI; + this->makeCompressed(); + this->resizeNonZeros(n); + Eigen::Map(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); + Eigen::Map(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); + Eigen::Map > values = this->coeffs(); + values.setZero(); + internal::call_assignment_no_alias(values, diagXpr, assignFunc); + } + else + { + bool isComp = isCompressed(); + internal::evaluator diaEval(diagXpr); + std::vector newEntries; + + // 1 - try in-place update and record insertion failures + for(Index i = 0; ilower_bound(i,i); + Index p = lb.value; + if(lb.found) + { + // the coeff already exists + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) + { + // non compressed mode with local room for inserting one element + m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); + m_innerNonZeros[i]++; + m_data.value(p) = Scalar(0); + m_data.index(p) = StorageIndex(i); + assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); + } + else + { + // defer insertion + newEntries.push_back(IndexPosPair(i,p)); + } + } + // 2 - insert deferred entries + Index n_entries = Index(newEntries.size()); + if(n_entries>0) + { + Storage newData(m_data.size()+n_entries); + Index prev_p = 0; + Index prev_i = 0; + for(Index k=0; k T; std::vector tripletList; - triplets.reserve(estimation_of_entries); + tripletList.reserve(estimation_of_entries); for(...) { // ... @@ -986,7 +1101,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa * * \warning The list of triplets is read multiple times (at least twice). 
Therefore, it is not recommended to define * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather - * be explicitely stored into a std::vector for instance. + * be explicitly stored into a std::vector for instance. */ template template @@ -1232,7 +1347,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca } m_data.index(p) = convert_index(inner); - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } if(m_data.size() != m_data.allocatedSize()) @@ -1274,7 +1389,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& m_innerNonZeros[outer]++; m_data.index(p) = inner; - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } template @@ -1381,7 +1496,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& } m_data.index(p) = inner; - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } namespace internal { diff --git a/externals/eigen/Eigen/src/SparseCore/SparseMatrixBase.h b/externals/eigen/Eigen/src/SparseCore/SparseMatrixBase.h index c6b548f1..229449f0 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseMatrixBase.h @@ -87,6 +87,11 @@ template class SparseMatrixBase * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". 
@@ -350,18 +355,6 @@ template class SparseMatrixBase const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); } - // inner-vector - typedef Block InnerVectorReturnType; - typedef Block ConstInnerVectorReturnType; - InnerVectorReturnType innerVector(Index outer); - const ConstInnerVectorReturnType innerVector(Index outer) const; - - // set of inner-vectors - typedef Block InnerVectorsReturnType; - typedef Block ConstInnerVectorsReturnType; - InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize); - const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const; - DenseMatrixType toDense() const { return DenseMatrixType(derived()); diff --git a/externals/eigen/Eigen/src/SparseCore/SparseProduct.h b/externals/eigen/Eigen/src/SparseCore/SparseProduct.h index 4cbf6878..af8a7744 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseProduct.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseProduct.h @@ -17,7 +17,7 @@ namespace Eigen { * The automatic pruning of the small values can be achieved by calling the pruned() function * in which case a totally different product algorithm is employed: * \code - * C = (A*B).pruned(); // supress numerical zeros (exact) + * C = (A*B).pruned(); // suppress numerical zeros (exact) * C = (A*B).pruned(ref); * C = (A*B).pruned(ref,epsilon); * \endcode @@ -164,6 +164,18 @@ struct unary_evaluator >, IteratorBased> } // end namespace internal +// sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.) 
+template +template +SparseMatrix& SparseMatrix::operator=(const Product& src) +{ + // std::cout << "in Assignment : " << DstOptions << "\n"; + SparseMatrix dst(src.rows(),src.cols()); + internal::generic_product_impl::evalTo(dst,src.lhs(),src.rhs()); + this->swap(dst); + return *this; +} + } // end namespace Eigen #endif // EIGEN_SPARSEPRODUCT_H diff --git a/externals/eigen/Eigen/src/SparseCore/SparseRef.h b/externals/eigen/Eigen/src/SparseCore/SparseRef.h index d91f38f9..748f87d6 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseRef.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseRef.h @@ -201,7 +201,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); obj->~TPlainObjectType(); } } @@ -213,7 +213,7 @@ class Ref, Options, StrideType { if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed())) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); @@ -227,14 +227,14 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); } protected: - char m_object_bytes[sizeof(TPlainObjectType)]; + typename internal::aligned_storage::type m_storage; bool m_hasCopy; }; @@ -319,7 +319,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); obj->~TPlainObjectType(); } } @@ -335,14 +335,14 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = 
reinterpret_cast(m_object_bytes); + TPlainObjectType* obj = reinterpret_cast(&m_storage); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); } protected: - char m_object_bytes[sizeof(TPlainObjectType)]; + typename internal::aligned_storage::type m_storage; bool m_hasCopy; }; diff --git a/externals/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h b/externals/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h index 9e39be73..85b00e10 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -47,6 +47,7 @@ template class SparseSelfAdjointView enum { Mode = _Mode, + TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0), RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime }; @@ -141,6 +142,9 @@ template class SparseSelfAdjointView return *this = src.twistedBy(pnull); } + // Since we override the copy-assignment operator, we need to explicitly re-declare the copy-constructor + EIGEN_DEFAULT_COPY_CONSTRUCTOR(SparseSelfAdjointView) + template SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src) { @@ -310,7 +314,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons while (i && i.index() dstT(dst); - internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); + internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); } }; @@ -452,7 +456,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix::type::Scalar Scalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; typedef typename remove_all::type::StorageIndex StorageIndex; // make sure to call innerSize/outerSize since we fake the storage order. 
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); // allocate a temporary buffer - AmbiVector tempVector(rows); + AmbiVector tempVector(rows); // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) @@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index()) tempVector.restart(); - Scalar x = rhsIt.value(); + RhsScalar x = rhsIt.value(); for (typename evaluator::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) { tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x; } } res.startVec(j); - for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) + for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) res.insertBackByOuterInner(j,it.index()) = it.value(); } res.finalize(); @@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector; template struct sparse_sparse_product_with_pruning_selector { - typedef typename traits::type>::Scalar Scalar; typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) @@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; - typedef SparseMatrix ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixLhs colLhs(lhs); ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, colRhs, res, tolerance); @@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixLhs; + typedef SparseMatrix RowMajorMatrixLhs; RowMajorMatrixLhs rowLhs(lhs); sparse_sparse_product_with_pruning_selector(rowLhs,rhs,res,tolerance); } @@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixRhs; + typedef SparseMatrix 
RowMajorMatrixRhs; RowMajorMatrixRhs rowRhs(rhs); sparse_sparse_product_with_pruning_selector(lhs,rowRhs,res,tolerance); } @@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(lhs, colRhs, res, tolerance); } @@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixLhs; ColMajorMatrixLhs colLhs(lhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, rhs, res, tolerance); } diff --git a/externals/eigen/Eigen/src/SparseCore/SparseUtil.h b/externals/eigen/Eigen/src/SparseCore/SparseUtil.h index 74df0d49..ceb93688 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseUtil.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseUtil.h @@ -140,6 +140,14 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS template<> struct glue_shapes { typedef SparseSelfAdjointShape type; }; template<> struct glue_shapes { typedef SparseTriangularShape type; }; +// return type of SparseCompressedBase::lower_bound; +struct LowerBoundIndex { + LowerBoundIndex() : value(-1), found(false) {} + LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {} + Index value; + bool found; +}; + } // end namespace internal /** \ingroup SparseCore_Module diff --git a/externals/eigen/Eigen/src/SparseCore/SparseVector.h b/externals/eigen/Eigen/src/SparseCore/SparseVector.h index 19b0fbc9..05779be6 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseVector.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseVector.h @@ -281,7 +281,7 @@ class SparseVector } /** Swaps the values of \c *this and \a other. - * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only. + * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only. 
* \sa SparseMatrixBase::swap() */ inline void swap(SparseVector& other) diff --git a/externals/eigen/Eigen/src/SparseCore/SparseView.h b/externals/eigen/Eigen/src/SparseCore/SparseView.h index 7c4aea74..92b3d1f7 100644 --- a/externals/eigen/Eigen/src/SparseCore/SparseView.h +++ b/externals/eigen/Eigen/src/SparseCore/SparseView.h @@ -90,6 +90,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator : public EvalIterator { + protected: typedef typename XprType::Scalar Scalar; public: diff --git a/externals/eigen/Eigen/src/SparseLU/SparseLU.h b/externals/eigen/Eigen/src/SparseLU/SparseLU.h index f883ab38..0c8d8939 100644 --- a/externals/eigen/Eigen/src/SparseLU/SparseLU.h +++ b/externals/eigen/Eigen/src/SparseLU/SparseLU.h @@ -18,6 +18,63 @@ template struct SparseLUMatrixLReturnType; template struct SparseLUMatrixUReturnType; +template +class SparseLUTransposeView : public SparseSolverBase > +{ +protected: + typedef SparseSolverBase > APIBase; + using APIBase::m_isInitialized; +public: + typedef typename SparseLUType::Scalar Scalar; + typedef typename SparseLUType::StorageIndex StorageIndex; + typedef typename SparseLUType::MatrixType MatrixType; + typedef typename SparseLUType::OrderingType OrderingType; + + enum { + ColsAtCompileTime = MatrixType::ColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime + }; + + SparseLUTransposeView() : m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) { + this->m_sparseLU = view.m_sparseLU; + } + void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} + void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;} + using APIBase::_solve_impl; + template + bool _solve_impl(const MatrixBase &B, MatrixBase &X_base) const + { + Dest& X(X_base.derived()); + eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first"); + EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); 
+ + + // this ugly const_cast_derived() helps to detect aliasing when applying the permutations + for(Index j = 0; j < B.cols(); ++j){ + X.col(j) = m_sparseLU->colsPermutation() * B.const_cast_derived().col(j); + } + //Forward substitution with transposed or adjoint of U + m_sparseLU->matrixU().template solveTransposedInPlace(X); + + //Backward substitution with transposed or adjoint of L + m_sparseLU->matrixL().template solveTransposedInPlace(X); + + // Permute back the solution + for (Index j = 0; j < B.cols(); ++j) + X.col(j) = m_sparseLU->rowsPermutation().transpose() * X.col(j); + return true; + } + inline Index rows() const { return m_sparseLU->rows(); } + inline Index cols() const { return m_sparseLU->cols(); } + +private: + SparseLUType *m_sparseLU; + SparseLUTransposeView& operator=(const SparseLUTransposeView&); +}; + + /** \ingroup SparseLU_Module * \class SparseLU * @@ -26,7 +83,7 @@ template struct SparseLUMatrixURetu * This class implements the supernodal LU factorization for general matrices. * It uses the main techniques from the sequential SuperLU package * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real - * and complex arithmetics with single and double precision, depending on the + * and complex arithmetic with single and double precision, depending on the * scalar type of your input matrix. * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. * It benefits directly from the built-in high-performant Eigen BLAS routines. 
@@ -43,8 +100,8 @@ template struct SparseLUMatrixURetu * Simple example with key steps * \code * VectorXd x(n), b(n); - * SparseMatrix A; - * SparseLU, COLAMDOrdering > solver; + * SparseMatrix A; + * SparseLU, COLAMDOrdering > solver; * // fill A and b; * // Compute the ordering permutation vector from the structural pattern of A * solver.analyzePattern(A); @@ -97,6 +154,7 @@ class SparseLU : public SparseSolverBase >, }; public: + SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1) { initperfvalues(); @@ -128,6 +186,45 @@ class SparseLU : public SparseSolverBase >, //Factorize factorize(matrix); } + + /** \returns an expression of the transposed of the factored matrix. + * + * A typical usage is to solve for the transposed problem A^T x = b: + * \code + * solver.compute(A); + * x = solver.transpose().solve(b); + * \endcode + * + * \sa adjoint(), solve() + */ + const SparseLUTransposeView > transpose() + { + SparseLUTransposeView > transposeView; + transposeView.setSparseLU(this); + transposeView.setIsInitialized(this->m_isInitialized); + return transposeView; + } + + + /** \returns an expression of the adjoint of the factored matrix + * + * A typical usage is to solve for the adjoint problem A' x = b: + * \code + * solver.compute(A); + * x = solver.adjoint().solve(b); + * \endcode + * + * For real scalar types, this function is equivalent to transpose(). + * + * \sa transpose(), solve() + */ + const SparseLUTransposeView > adjoint() + { + SparseLUTransposeView > adjointView; + adjointView.setSparseLU(this); + adjointView.setIsInitialized(this->m_isInitialized); + return adjointView; + } inline Index rows() const { return m_mat.rows(); } inline Index cols() const { return m_mat.cols(); } @@ -193,7 +290,7 @@ class SparseLU : public SparseSolverBase >, /** \brief Reports whether previous computation was successful. 
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * @@ -355,6 +452,9 @@ class SparseLU : public SparseSolverBase >, return (m_detPermR * m_detPermC) > 0 ? det : -det; } + Index nnzL() const { return m_nnzL; }; + Index nnzU() const { return m_nnzU; }; + protected: // Functions void initperfvalues() @@ -391,7 +491,6 @@ class SparseLU : public SparseSolverBase >, private: // Disable copy constructor SparseLU (const SparseLU& ); - }; // End class SparseLU @@ -499,11 +598,8 @@ void SparseLU::factorize(const MatrixType& matrix) eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices"); - typedef typename IndexVector::Scalar StorageIndex; - m_isInitialized = true; - // Apply the column permutation computed in analyzepattern() // m_mat = matrix * m_perm_c.inverse(); m_mat = matrix; @@ -587,7 +683,6 @@ void SparseLU::factorize(const MatrixType& matrix) // (a) a relaxed supernode at the bottom of the etree, or // (b) panel_size contiguous columns, defined by the user Index jcol; - IndexVector panel_histo(n); Index pivrow; // Pivotal row number in the original row matrix Index nseg1; // Number of segments in U-column above panel row jcol Index nseg; // Number of segments in each U-column @@ -706,13 +801,19 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator typedef typename MappedSupernodalType::Scalar Scalar; explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL) { } - Index rows() { return m_mapL.rows(); } - Index cols() { return m_mapL.cols(); } + Index rows() const { return m_mapL.rows(); } + Index cols() const { return m_mapL.cols(); } template void solveInPlace( MatrixBase &X) const { m_mapL.solveInPlace(X); } + template + void 
solveTransposedInPlace( MatrixBase &X) const + { + m_mapL.template solveTransposedInPlace(X); + } + const MappedSupernodalType& m_mapL; }; @@ -723,8 +824,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL),m_mapU(mapU) { } - Index rows() { return m_mapL.rows(); } - Index cols() { return m_mapL.cols(); } + Index rows() const { return m_mapL.rows(); } + Index cols() const { return m_mapL.cols(); } template void solveInPlace(MatrixBase &X) const { @@ -747,8 +848,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } else { + // FIXME: the following lines should use Block expressions and not Map! Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); } @@ -766,6 +868,52 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } } // End For U-solve } + + template void solveTransposedInPlace(MatrixBase &X) const + { + using numext::conj; + Index nrhs = X.cols(); + Index n = X.rows(); + // Forward solve with U + for (Index k = 0; k <= m_mapL.nsuper(); k++) + { + Index fsupc = m_mapL.supToCol()[k]; + Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension + Index nsupc = m_mapL.supToCol()[k+1] - fsupc; + Index luptr = m_mapL.colIndexPtr()[fsupc]; + + for (Index j = 0; j < nrhs; ++j) + { + for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++) + { + typename MatrixUType::InnerIterator it(m_mapU, jcol); + for ( ; it; ++it) + { + Index irow = it.index(); + X(jcol, j) -= X(irow, j) * (Conjugate? conj(it.value()): it.value()); + } + } + } + if (nsupc == 1) + { + for (Index j = 0; j < nrhs; j++) + { + X(fsupc, j) /= (Conjugate? 
conj(m_mapL.valuePtr()[luptr]) : m_mapL.valuePtr()[luptr]); + } + } + else + { + Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); + Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + if(Conjugate) + U = A.adjoint().template triangularView().solve(U); + else + U = A.transpose().template triangularView().solve(U); + } + }// End For U-solve + } + + const MatrixLType& m_mapL; const MatrixUType& m_mapU; }; diff --git a/externals/eigen/Eigen/src/SparseLU/SparseLU_Memory.h b/externals/eigen/Eigen/src/SparseLU/SparseLU_Memory.h index 4dc42e87..349bfd58 100644 --- a/externals/eigen/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/externals/eigen/Eigen/src/SparseLU/SparseLU_Memory.h @@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w) /** - * Expand the existing storage to accomodate more fill-ins + * Expand the existing storage to accommodate more fill-ins * \param vec Valid pointer to the vector to allocate or expand * \param[in,out] length At input, contain the current length of the vector that is to be increased. 
At output, length of the newly allocated vector * \param[in] nbElts Current number of elements in the factors diff --git a/externals/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/externals/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 721e1883..0be293d1 100644 --- a/externals/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/externals/eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -75,12 +75,12 @@ class MappedSuperNodalMatrix /** * Number of rows */ - Index rows() { return m_row; } + Index rows() const { return m_row; } /** * Number of columns */ - Index cols() { return m_col; } + Index cols() const { return m_col; } /** * Return the array of nonzero values packed by column @@ -156,6 +156,9 @@ class MappedSuperNodalMatrix class InnerIterator; template void solveInPlace( MatrixBase&X) const; + template + void solveTransposedInPlace( MatrixBase&X) const; + @@ -294,6 +297,77 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co } } +template +template +void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase&X) const +{ + using numext::conj; + Index n = int(X.rows()); + Index nrhs = Index(X.cols()); + const Scalar * Lval = valuePtr(); // Nonzero values + Matrix work(n, nrhs); // working vector + work.setZero(); + for (Index k = nsuper(); k >= 0; k--) + { + Index fsupc = supToCol()[k]; // First column of the current supernode + Index istart = rowIndexPtr()[fsupc]; // Pointer index to the subscript of the current column + Index nsupr = rowIndexPtr()[fsupc+1] - istart; // Number of rows in the current supernode + Index nsupc = supToCol()[k+1] - fsupc; // Number of columns in the current supernode + Index nrow = nsupr - nsupc; // Number of rows in the non-diagonal part of the supernode + Index irow; //Current index row + + if (nsupc == 1 ) + { + for (Index j = 0; j < nrhs; j++) + { + InnerIterator it(*this, fsupc); + ++it; // Skip the diagonal element + for (; it; ++it) + { + irow = it.row(); + X(fsupc,j) -= X(irow, 
j) * (Conjugate?conj(it.value()):it.value()); + } + } + } + else + { + // The supernode has more than one column + Index luptr = colIndexPtr()[fsupc]; + Index lda = colIndexPtr()[fsupc+1] - luptr; + + //Begin Gather + for (Index j = 0; j < nrhs; j++) + { + Index iptr = istart + nsupc; + for (Index i = 0; i < nrow; i++) + { + irow = rowIndex()[iptr]; + work.topRows(nrow)(i,j)= X(irow,j); // Gather operation + iptr++; + } + } + + // Matrix-vector product with transposed submatrix + Map, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); + Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + if(Conjugate) + U = U - A.adjoint() * work.topRows(nrow); + else + U = U - A.transpose() * work.topRows(nrow); + + // Triangular solve (of transposed diagonal block) + new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); + if(Conjugate) + U = A.adjoint().template triangularView().solve(U); + else + U = A.transpose().template triangularView().solve(U); + + } + + } +} + + } // end namespace internal } // end namespace Eigen diff --git a/externals/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h b/externals/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h index c98b30e3..5a2c941b 100644 --- a/externals/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/externals/eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -151,7 +151,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j StorageIndex ito = glu.xlsub(fsupc+1); glu.xlsub(jcolm1) = ito; StorageIndex istop = ito + jptr - jm1ptr; - xprune(jcolm1) = istop; // intialize xprune(jcol-1) + xprune(jcolm1) = istop; // initialize xprune(jcol-1) glu.xlsub(jcol) = istop; for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) @@ -166,7 +166,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j // Tidy up the pointers before exit glu.xsup(nsuper+1) = jcolp1; glu.supno(jcolp1) = nsuper; - xprune(jcol) = StorageIndex(nextl); 
// Intialize upper bound for pruning + xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning glu.xlsub(jcolp1) = StorageIndex(nextl); return 0; diff --git a/externals/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/externals/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index 95ba7413..e37c2fe0 100644 --- a/externals/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/externals/eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ pstore(C0+i+(I)*PacketSize, c0); - // agressive vectorization and peeling + // aggressive vectorization and peeling for(Index i=0; i + * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing + * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011. + * + * Even though it is qualified as "rank-revealing", this strategy might fail for some + * rank deficient problems. When this class is used to solve linear or least-square problems + * it is thus strongly recommended to check the accuracy of the computed solution. If it + * failed, it usually helps to increase the threshold with setPivotThreshold. + * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). + * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. 
* */ template @@ -196,9 +209,9 @@ class SparseQR : public SparseSolverBase > Index rank = this->rank(); - // Compute Q^T * b; + // Compute Q^* * b; typename Dest::PlainObject y, b; - y = this->matrixQ().transpose() * B; + y = this->matrixQ().adjoint() * B; b = y; // Solve with the triangular matrix R @@ -330,7 +343,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) m_R.resize(m, n); m_Q.resize(m, diagSize); - // Allocate space for nonzero elements : rough estimation + // Allocate space for nonzero elements: rough estimation m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree m_Q.reserve(2*mat.nonZeros()); m_hcoeffs.resize(diagSize); @@ -604,7 +617,7 @@ struct SparseQR_QProduct : ReturnByValue=0; k--) + Index start_k = internal::is_identity::value ? numext::mini(j,diagSize-1) : diagSize-1; + for (Index k = start_k; k >=0; k--) { Scalar tau = Scalar(0); tau = m_qr.m_Q.col(k).dot(res.col(j)); if(tau==Scalar(0)) continue; - tau = tau * m_qr.m_hcoeffs(k); + tau = tau * numext::conj(m_qr.m_hcoeffs(k)); res.col(j) -= tau * m_qr.m_Q.col(k); } } @@ -650,7 +667,7 @@ struct SparseQR_QProduct : ReturnByValue @@ -668,13 +685,14 @@ struct SparseQRMatrixQReturnType : public EigenBase(m_qr,other.derived(),false); } + // To use for operations with the adjoint of Q SparseQRMatrixQTransposeReturnType adjoint() const { return SparseQRMatrixQTransposeReturnType(m_qr); } inline Index rows() const { return m_qr.rows(); } - inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); } - // To use for operations with the transpose of Q + inline Index cols() const { return m_qr.rows(); } + // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment SparseQRMatrixQTransposeReturnType transpose() const { return SparseQRMatrixQTransposeReturnType(m_qr); @@ -682,6 +700,7 @@ struct SparseQRMatrixQReturnType : public EigenBase struct SparseQRMatrixQTransposeReturnType { @@ 
-712,7 +731,7 @@ struct Assignment, internal: typedef typename DstXprType::StorageIndex StorageIndex; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { - typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows()); + typename DstXprType::PlainObject idMat(src.rows(), src.cols()); idMat.setIdentity(); // Sort the sparse householder reflectors if needed const_cast(&src.m_qr)->_sort_matrix_Q(); diff --git a/externals/eigen/Eigen/src/StlSupport/StdDeque.h b/externals/eigen/Eigen/src/StlSupport/StdDeque.h index cf1fedf9..6d47e757 100644 --- a/externals/eigen/Eigen/src/StlSupport/StdDeque.h +++ b/externals/eigen/Eigen/src/StlSupport/StdDeque.h @@ -36,7 +36,7 @@ namespace std \ deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -98,17 +98,7 @@ namespace std { { return deque_base::insert(position,x); } void insert(const_iterator position, size_type new_size, const value_type& x) { deque_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) - // workaround GCC std::deque implementation - void resize(size_type new_size, const 
value_type& x) - { - if (new_size < deque_base::size()) - deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size); - else - deque_base::insert(deque_base::end(), new_size - deque_base::size(), x); - } #else - // either GCC 4.1 or non-GCC // default implementation which should always work. void resize(size_type new_size, const value_type& x) { diff --git a/externals/eigen/Eigen/src/StlSupport/StdList.h b/externals/eigen/Eigen/src/StlSupport/StdList.h index e1eba498..8ba3fada 100644 --- a/externals/eigen/Eigen/src/StlSupport/StdList.h +++ b/externals/eigen/Eigen/src/StlSupport/StdList.h @@ -35,7 +35,7 @@ namespace std \ list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ diff --git a/externals/eigen/Eigen/src/StlSupport/StdVector.h b/externals/eigen/Eigen/src/StlSupport/StdVector.h index ec22821d..9fcf19bc 100644 --- a/externals/eigen/Eigen/src/StlSupport/StdVector.h +++ b/externals/eigen/Eigen/src/StlSupport/StdVector.h @@ -36,7 +36,7 @@ namespace std \ vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit 
vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ diff --git a/externals/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h b/externals/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h index 50a69f30..d1d3ad7f 100644 --- a/externals/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/externals/eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -217,12 +217,12 @@ struct SluMatrix : SuperMatrix res.setScalarType(); // FIXME the following is not very accurate - if (MatrixType::Flags & Upper) + if (int(MatrixType::Flags) & int(Upper)) res.Mtype = SLU_TRU; - if (MatrixType::Flags & Lower) + if (int(MatrixType::Flags) & int(Lower)) res.Mtype = SLU_TRL; - eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU"); + eigen_assert(((int(MatrixType::Flags) & int(SelfAdjoint))==0) && "SelfAdjoint matrix shape not supported by SuperLU"); return res; } @@ -297,8 +297,8 @@ SluMatrix asSluMatrix(MatrixType& mat) template MappedSparseMatrix map_superlu(SluMatrix& sluMat) { - eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR - || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC); + eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR) + || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC)); Index 
outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow; @@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -650,9 +650,8 @@ void SuperLU::_solve_impl(const MatrixBase &b, MatrixBase { eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); - const Index size = m_matrix.rows(); const Index rhsCols = b.cols(); - eigen_assert(size==b.rows()); + eigen_assert(m_matrix.rows()==b.rows()); m_sluOptions.Trans = NOTRANS; m_sluOptions.Fact = FACTORED; @@ -974,9 +973,8 @@ void SuperILU::_solve_impl(const MatrixBase &b, MatrixBase wrapper functions: -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) + // Defaults +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) { umfpack_di_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_defaults(control); } -inline void umfpack_free_numeric(void **Numeric, double) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long) +{ umfpack_dl_defaults(control); } + +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) +{ umfpack_zl_defaults(control); } + +// Report info +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int) +{ umfpack_di_report_info(control, info);} + +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, int) +{ umfpack_zi_report_info(control, info);} + +inline void 
umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long) +{ umfpack_dl_report_info(control, info);} + +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, SuiteSparse_long) +{ umfpack_zl_report_info(control, info);} + +// Report status +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int) +{ umfpack_di_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, int) +{ umfpack_zi_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long) +{ umfpack_dl_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, SuiteSparse_long) +{ umfpack_zl_report_status(control, status);} + +// report control +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) +{ umfpack_di_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, int) +{ umfpack_zi_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long) +{ umfpack_dl_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) +{ umfpack_zl_report_control(control);} + +// Free numeric +inline void umfpack_free_numeric(void **Numeric, double, int) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, std::complex) +inline void umfpack_free_numeric(void **Numeric, std::complex, int) { umfpack_zi_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_symbolic(void **Symbolic, double) +inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long) +{ umfpack_dl_free_numeric(Numeric); *Numeric = 
0; } + +inline void umfpack_free_numeric(void **Numeric, std::complex, SuiteSparse_long) +{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; } + +// Free symbolic +inline void umfpack_free_symbolic(void **Symbolic, double, int) { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, std::complex) +inline void umfpack_free_symbolic(void **Symbolic, std::complex, int) { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; } +inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long) +{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; } + +inline void umfpack_free_symbolic(void **Symbolic, std::complex, SuiteSparse_long) +{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; } + +// Symbolic inline int umfpack_symbolic(int n_row,int n_col, const int Ap[], const int Ai[], const double Ax[], void **Symbolic, const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) @@ -48,7 +119,21 @@ inline int umfpack_symbolic(int n_row,int n_col, { return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info); +} + +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); +} +// Numeric inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[], void *Symbolic, void **Numeric, const double Control[UMFPACK_CONTROL],double Info 
[UMFPACK_INFO]) @@ -62,7 +147,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex Ax[], + void *Symbolic, void **Numeric, + const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) +{ + return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info); +} +// solve inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[], double X[], const double B[], void *Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) @@ -77,6 +176,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], + double X[], const double B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info); +} + +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], + std::complex X[], const std::complex B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); +} + +// Get Lunz inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double) { return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); @@ -87,6 +201,19 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_ return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + 
SuiteSparse_long *nz_udiag, void *Numeric, double) +{ + return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, std::complex) +{ + return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +// Get Numeric inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[], int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric) { @@ -102,18 +229,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex Lx[], in return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, Dx?&dx0_real:0,0,do_recip,Rs,Numeric); } +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) +{ + return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric); +} -inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], std::complex Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) +{ + double& lx0_real = numext::real_ref(Lx[0]); + double& ux0_real = numext::real_ref(Ux[0]); + double& dx0_real = numext::real_ref(Dx[0]); + return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, + Dx?&dx0_real:0,0,do_recip,Rs,Numeric); +} + +// Get Determinant +inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { return 
umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { double& mx_real = numext::real_ref(*Mx); return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); } +inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) +{ + return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info); +} + +inline SuiteSparse_long umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) +{ + double& mx_real = numext::real_ref(*Mx); + return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); +} + /** \ingroup UmfPackSupport_Module * \brief A sparse LU factorization and solver based on UmfPack @@ -146,7 +300,7 @@ class UmfPackLU : public SparseSolverBase > typedef Matrix IntRowVectorType; typedef Matrix IntColVectorType; typedef SparseMatrix LUMatrixType; - typedef SparseMatrix UmfpackMatrixType; + typedef SparseMatrix UmfpackMatrixType; typedef Ref UmfpackMatrixRef; enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -156,6 +310,7 @@ class UmfPackLU : public SparseSolverBase > public: typedef Array UmfpackControl; + typedef Array UmfpackInfo; UmfPackLU() : m_dummy(0,0), mp_matrix(m_dummy) @@ -173,8 +328,8 @@ class UmfPackLU : public SparseSolverBase > ~UmfPackLU() { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex()); } inline Index rows() const { return mp_matrix.rows(); } @@ -182,7 +337,7 @@ class UmfPackLU : public 
SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -215,15 +370,15 @@ class UmfPackLU : public SparseSolverBase > return m_q; } - /** Computes the sparse Cholesky decomposition of \a matrix + /** Computes the sparse Cholesky decomposition of \a matrix * Note that the matrix should be column-major, and in compressed format for best performance. * \sa SparseMatrix::makeCompressed(). */ template void compute(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); analyzePattern_impl(); factorize_impl(); @@ -238,9 +393,9 @@ class UmfPackLU : public SparseSolverBase > template void analyzePattern(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); - + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); + grab(matrix.derived()); analyzePattern_impl(); @@ -267,7 +422,7 @@ class UmfPackLU : public SparseSolverBase > { return m_control; } - + /** Provides access to the control settings array used by UmfPack. * * If this array contains NaN's, the default values are used. @@ -278,7 +433,7 @@ class UmfPackLU : public SparseSolverBase > { return m_control; } - + /** Performs a numeric decomposition of \a matrix * * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed. 
@@ -290,13 +445,41 @@ class UmfPackLU : public SparseSolverBase > { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); if(m_numeric) - umfpack_free_numeric(&m_numeric,Scalar()); + umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); - + factorize_impl(); } + /** Prints the current UmfPack control settings. + * + * \sa umfpackControl() + */ + void printUmfpackControl() + { + umfpack_report_control(m_control.data(), Scalar(),StorageIndex()); + } + + /** Prints statistics collected by UmfPack. + * + * \sa analyzePattern(), compute() + */ + void printUmfpackInfo() + { + eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); + umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex()); + } + + /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization). + * + * \sa analyzePattern(), compute() + */ + void printUmfpackStatus() { + eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); + umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex()); + } + /** \internal */ template bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; @@ -314,41 +497,42 @@ class UmfPackLU : public SparseSolverBase > m_numeric = 0; m_symbolic = 0; m_extractedDataAreDirty = true; + + umfpack_defaults(m_control.data(), Scalar(),StorageIndex()); } - + void analyzePattern_impl() { - umfpack_defaults(m_control.data(), Scalar()); - int errorCode = 0; - errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), - internal::convert_index(mp_matrix.cols()), - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - &m_symbolic, m_control.data(), 0); + m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), + internal::convert_index(mp_matrix.cols()), + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), 
mp_matrix.valuePtr(), + &m_symbolic, m_control.data(), m_umfpackInfo.data()); m_isInitialized = true; - m_info = errorCode ? InvalidInput : Success; + m_info = m_fact_errorCode ? InvalidInput : Success; m_analysisIsOk = true; m_factorizationIsOk = false; m_extractedDataAreDirty = true; } - + void factorize_impl() { + m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - m_symbolic, &m_numeric, m_control.data(), 0); + m_symbolic, &m_numeric, m_control.data(), m_umfpackInfo.data()); m_info = m_fact_errorCode == UMFPACK_OK ? Success : NumericalIssue; m_factorizationIsOk = true; m_extractedDataAreDirty = true; } - + template void grab(const EigenBase &A) { mp_matrix.~UmfpackMatrixRef(); ::new (&mp_matrix) UmfpackMatrixRef(A.derived()); } - + void grab(const UmfpackMatrixRef &A) { if(&(A.derived()) != &mp_matrix) @@ -357,19 +541,20 @@ class UmfPackLU : public SparseSolverBase > ::new (&mp_matrix) UmfpackMatrixRef(A); } } - + // cached data to reduce reallocation, etc. 
mutable LUMatrixType m_l; - int m_fact_errorCode; + StorageIndex m_fact_errorCode; UmfpackControl m_control; - + mutable UmfpackInfo m_umfpackInfo; + mutable LUMatrixType m_u; mutable IntColVectorType m_p; mutable IntRowVectorType m_q; UmfpackMatrixType m_dummy; UmfpackMatrixRef mp_matrix; - + void* m_numeric; void* m_symbolic; @@ -377,7 +562,7 @@ class UmfPackLU : public SparseSolverBase > int m_factorizationIsOk; int m_analysisIsOk; mutable bool m_extractedDataAreDirty; - + private: UmfPackLU(const UmfPackLU& ) { } }; @@ -389,7 +574,7 @@ void UmfPackLU::extractData() const if (m_extractedDataAreDirty) { // get size of the data - int lnz, unz, rows, cols, nz_udiag; + StorageIndex lnz, unz, rows, cols, nz_udiag; umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); // allocate data @@ -415,7 +600,7 @@ template typename UmfPackLU::Scalar UmfPackLU::determinant() const { Scalar det; - umfpack_get_determinant(&det, 0, m_numeric, 0); + umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex()); return det; } @@ -427,8 +612,7 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet"); eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet"); eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); - - int errorCode; + Scalar* x_ptr = 0; Matrix x_tmp; if(x.innerStride()!=1) @@ -440,9 +624,10 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas { if(x.innerStride()==1) x_ptr = &x.col(j).coeffRef(0); - errorCode = umfpack_solve(UMFPACK_A, - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), 0); + StorageIndex errorCode = umfpack_solve(UMFPACK_A, + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + 
x_ptr, &b.const_cast_derived().col(j).coeffRef(0), + m_numeric, m_control.data(), m_umfpackInfo.data()); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/externals/eigen/Eigen/src/misc/lapacke.h b/externals/eigen/Eigen/src/misc/lapacke.h index 8c7e79b0..3d8e24f5 100644 --- a/externals/eigen/Eigen/src/misc/lapacke.h +++ b/externals/eigen/Eigen/src/misc/lapacke.h @@ -43,10 +43,6 @@ #include "lapacke_config.h" #endif -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - #include #ifndef lapack_int @@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif diff --git a/externals/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/externals/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 1f8a531a..0e5d5445 100644 --- a/externals/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/externals/eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -75,6 +75,32 @@ max return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } +/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other + * + * Example: \include Cwise_absolute_difference.cpp + * Output: \verbinclude Cwise_absolute_difference.out + * + * \sa absolute_difference() + */ +EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference) + +/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other + * + * \sa absolute_difference() + */ +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +absolute_difference +#else +(absolute_difference) +#endif +(const Scalar &other) const +{ + return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + /** \returns an expression of the coefficient-wise power of \c *this 
to the given array of \a exponents. * * This function computes the coefficient-wise power. @@ -119,7 +145,7 @@ OP(const Scalar& s) const { \ return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \ } \ EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \ -OP(const Scalar& s, const Derived& d) { \ +OP(const Scalar& s, const EIGEN_CURRENT_STORAGE_BASE_CLASS& d) { \ return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \ } @@ -314,9 +340,9 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS &n) const * * It returns the Riemann zeta function of two arguments \c *this and \a q: * - * \param *this is the exposent, it must be > 1 * \param q is the shift, it must be > 0 * + * \note *this is the exponent, it must be > 1. * \note This function supports only float and double scalar types. To support other scalar types, the user has * to provide implementations of zeta(T,T) for any scalar type T to be supported. * diff --git a/externals/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/externals/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h index ebaa3f19..13c55f4b 100644 --- a/externals/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/externals/eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -10,9 +10,11 @@ typedef CwiseUnaryOp, const Derived> Inverse typedef CwiseUnaryOp, const Derived> BooleanNotReturnType; typedef CwiseUnaryOp, const Derived> ExpReturnType; +typedef CwiseUnaryOp, const Derived> Expm1ReturnType; typedef CwiseUnaryOp, const Derived> LogReturnType; typedef CwiseUnaryOp, const Derived> Log1pReturnType; typedef CwiseUnaryOp, const Derived> Log10ReturnType; +typedef CwiseUnaryOp, const Derived> Log2ReturnType; typedef CwiseUnaryOp, const Derived> CosReturnType; typedef CwiseUnaryOp, const Derived> SinReturnType; typedef CwiseUnaryOp, const Derived> TanReturnType; @@ -20,11 +22,18 @@ typedef CwiseUnaryOp, const Derived> AcosReturn typedef CwiseUnaryOp, const Derived> AsinReturnType; typedef CwiseUnaryOp, 
const Derived> AtanReturnType; typedef CwiseUnaryOp, const Derived> TanhReturnType; +typedef CwiseUnaryOp, const Derived> LogisticReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; +#if EIGEN_HAS_CXX11_MATH +typedef CwiseUnaryOp, const Derived> AtanhReturnType; +typedef CwiseUnaryOp, const Derived> AsinhReturnType; +typedef CwiseUnaryOp, const Derived> AcoshReturnType; +#endif typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const Derived> CubeReturnType; typedef CwiseUnaryOp, const Derived> RoundReturnType; +typedef CwiseUnaryOp, const Derived> RintReturnType; typedef CwiseUnaryOp, const Derived> FloorReturnType; typedef CwiseUnaryOp, const Derived> CeilReturnType; typedef CwiseUnaryOp, const Derived> IsNaNReturnType; @@ -90,6 +99,20 @@ exp() const return ExpReturnType(derived()); } +/** \returns an expression of the coefficient-wise exponential of *this minus 1. + * + * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1, + * however, with finite precision, this function is much more accurate when \c x is close to zero. + * + * \sa Math functions, exp() + */ +EIGEN_DEVICE_FUNC +inline const Expm1ReturnType +expm1() const +{ + return Expm1ReturnType(derived()); +} + /** \returns an expression of the coefficient-wise logarithm of *this. * * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the @@ -98,7 +121,7 @@ exp() const * Example: \include Cwise_log.cpp * Output: \verbinclude Cwise_log.out * - * \sa Math functions, exp() + * \sa Math functions, log() */ EIGEN_DEVICE_FUNC inline const LogReturnType @@ -137,6 +160,18 @@ log10() const return Log10ReturnType(derived()); } +/** \returns an expression of the coefficient-wise base-2 logarithm of *this. + * + * This function computes the coefficient-wise base-2 logarithm. 
+ * + */ +EIGEN_DEVICE_FUNC +inline const Log2ReturnType +log2() const +{ + return Log2ReturnType(derived()); +} + /** \returns an expression of the coefficient-wise square root of *this. * * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the @@ -311,7 +346,7 @@ sinh() const * Example: \include Cwise_cosh.cpp * Output: \verbinclude Cwise_cosh.out * - * \sa Math functions, tan(), sinh(), cosh() + * \sa Math functions, tanh(), sinh(), cosh() */ EIGEN_DEVICE_FUNC inline const CoshReturnType @@ -320,6 +355,50 @@ cosh() const return CoshReturnType(derived()); } +#if EIGEN_HAS_CXX11_MATH +/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AtanhReturnType +atanh() const +{ + return AtanhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AsinhReturnType +asinh() const +{ + return AsinhReturnType(derived()); +} + +/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this. + * + * \sa Math functions, atanh(), asinh(), acosh() + */ +EIGEN_DEVICE_FUNC +inline const AcoshReturnType +acosh() const +{ + return AcoshReturnType(derived()); +} +#endif + +/** \returns an expression of the coefficient-wise logistic of *this. + */ +EIGEN_DEVICE_FUNC +inline const LogisticReturnType +logistic() const +{ + return LogisticReturnType(derived()); +} + /** \returns an expression of the coefficient-wise inverse of *this. * * Example: \include Cwise_inverse.cpp @@ -362,6 +441,20 @@ cube() const return CubeReturnType(derived()); } +/** \returns an expression of the coefficient-wise rint of *this. 
+ * + * Example: \include Cwise_rint.cpp + * Output: \verbinclude Cwise_rint.out + * + * \sa Math functions, ceil(), floor() + */ +EIGEN_DEVICE_FUNC +inline const RintReturnType +rint() const +{ + return RintReturnType(derived()); +} + /** \returns an expression of the coefficient-wise round of *this. * * Example: \include Cwise_round.cpp @@ -404,6 +497,45 @@ ceil() const return CeilReturnType(derived()); } +template struct ShiftRightXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type arithmetically + * shifted right by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftLeft() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftRightXpr::Type +shiftRight() const +{ + return typename ShiftRightXpr::Type(derived()); +} + + +template struct ShiftLeftXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type logically + * shifted left by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftRight() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftLeftXpr::Type +shiftLeft() const +{ + return typename ShiftLeftXpr::Type(derived()); +} + /** \returns an expression of the coefficient-wise isnan of *this. * * Example: \include Cwise_isNaN.cpp @@ -471,14 +603,12 @@ typedef CwiseUnaryOp, const Derived> LgammaRe typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; +typedef CwiseUnaryOp, const Derived> NdtriReturnType; /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|). * * \specialfunctions_module * - * Example: \include Cwise_lgamma.cpp - * Output: \verbinclude Cwise_lgamma.out - * * \note This function supports only float and double scalar types in c++11 mode. 
To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar * type T to be supported. @@ -514,9 +644,6 @@ digamma() const * * \specialfunctions_module * - * Example: \include Cwise_erf.cpp - * Output: \verbinclude Cwise_erf.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar * type T to be supported. @@ -535,9 +662,6 @@ erf() const * * \specialfunctions_module * - * Example: \include Cwise_erfc.cpp - * Output: \verbinclude Cwise_erfc.out - * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar * type T to be supported. @@ -550,3 +674,23 @@ erfc() const { return ErfcReturnType(derived()); } + +/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function + * function of *this. + * + * \specialfunctions_module + * + * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the + * Gaussian probability density function (integrated from minus infinity to x) is equal to y. + * + * \note This function supports only float and double scalar types. To support other scalar types, + * the user has to provide implementations of ndtri(T) for any scalar type T to be supported. 
+ * + * \sa Math functions + */ +EIGEN_DEVICE_FUNC +inline const NdtriReturnType +ndtri() const +{ + return NdtriReturnType(derived()); +} diff --git a/externals/eigen/Eigen/src/plugins/BlockMethods.h b/externals/eigen/Eigen/src/plugins/BlockMethods.h index ac35a008..63a52a6f 100644 --- a/externals/eigen/Eigen/src/plugins/BlockMethods.h +++ b/externals/eigen/Eigen/src/plugins/BlockMethods.h @@ -40,68 +40,126 @@ typedef const VectorBlock ConstSegmentReturnType; template struct FixedSegmentReturnType { typedef VectorBlock Type; }; template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; +/// \internal inner-vector +typedef Block InnerVectorReturnType; +typedef Block ConstInnerVectorReturnType; + +/// \internal set of inner-vectors +typedef Block InnerVectorsReturnType; +typedef Block ConstInnerVectorsReturnType; + #endif // not EIGEN_PARSED_BY_DOXYGEN -/// \returns a dynamic-size expression of a block in *this. +/// \returns an expression of a block in \c *this with either dynamic or fixed sizes. /// -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// \param blockRows the number of rows in the block -/// \param blockCols the number of columns in the block +/// \param startRow the first row in the block +/// \param startCol the first column in the block +/// \param blockRows number of rows in the block, specified at either run-time or compile-time +/// \param blockCols number of columns in the block, specified at either run-time or compile-time +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. 
/// -/// Example: \include MatrixBase_block_int_int_int_int.cpp +/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int_int_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// \newin{3.4}: +/// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. +/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols: +/// \code +/// mat.block(i,j,fix,cols) +/// \endcode +/// +/// This function thus fully covers the features offered by the following overloads block(Index, Index), +/// and block(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids +/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code. +/// +/// but with less redundancy and more consistency as it does not modify the argument order +/// and seamlessly enable hybrid fixed/dynamic sizes. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size matrix, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. 
/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index) +/// \sa class Block, fix, fix(int) /// -EIGEN_DEVICE_FUNC -inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) { - return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type( + derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); } -/// This is the const version of block(Index,Index,Index,Index). */ -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const +/// This is the const version of block(Index,Index,NRowsType,NColsType) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const { - return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type( + derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); } - -/// \returns a dynamic-size expression of a top-right corner of *this. +/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes. 
/// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// -/// Example: \include MatrixBase_topRightCorner_int_int.cpp +/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topRightCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr topRightCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +topRightCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of topRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const +/// This is the const version of topRightCorner(NRowsType, NColsType). 
+template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +topRightCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size top-right corner of *this. +/// \returns an expression of a fixed-size top-right corner of \c *this. /// /// \tparam CRows the number of rows in the corner /// \tparam CCols the number of columns in the corner @@ -114,21 +172,21 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner() { return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); } /// This is the const version of topRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); } -/// \returns an expression of a top-right corner of *this. +/// \returns an expression of a top-right corner of \c *this. 
/// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -148,46 +206,67 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } /// This is the const version of topRightCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } -/// \returns a dynamic-size expression of a top-left corner of *this. +/// \returns an expression of a top-left corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_topLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr topLeftCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +topLeftCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), 0, 0, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } /// This is the const version of topLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +topLeftCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), 0, 0, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size top-left corner of *this. +/// \returns an expression of a fixed-size top-left corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. 
/// @@ -196,24 +275,24 @@ inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type topLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner() { return typename FixedBlockXpr::Type(derived(), 0, 0); } /// This is the const version of topLeftCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type topLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0); } -/// \returns an expression of a top-left corner of *this. +/// \returns an expression of a top-left corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -233,46 +312,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } /// This is the const version of topLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } -/// \returns a dynamic-size expression of a bottom-right corner of *this. +/// \returns an expression of a bottom-right corner of \c *this with either dynamic or fixed sizes. 
/// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomRightCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +bottomRightCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of bottomRightCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const +/// This is the const version of bottomRightCorner(NRowsType, NColsType). 
+template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +const typename ConstFixedBlockXpr<...,...>::Type +#endif +bottomRightCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size bottom-right corner of *this. +/// \returns an expression of a fixed-size bottom-right corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -281,24 +383,24 @@ inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomRightCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /// This is the const version of bottomRightCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } -/// \returns an expression of a bottom-right corner of *this. +/// \returns an expression of a bottom-right corner of \c *this. 
/// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -318,46 +420,69 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /// This is the const version of bottomRightCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } -/// \returns a dynamic-size expression of a bottom-left corner of *this. +/// \returns an expression of a bottom-left corner of \c *this with either dynamic or fixed sizes. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out /// +/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename FixedBlockXpr<...,...>::Type +#endif +bottomLeftCorner(NRowsType cRows, NColsType cCols) { - return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); + return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), 0, + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// This is the const version of bottomLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC -inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const +/// This is the const version of bottomLeftCorner(NRowsType, NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type +#else +typename ConstFixedBlockXpr<...,...>::Type +#endif +bottomLeftCorner(NRowsType cRows, NColsType cCols) const { - return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); + return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type + (derived(), rows() - internal::get_runtime_value(cRows), 0, + internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); } -/// \returns an expression of a fixed-size bottom-left corner of *this. +/// \returns an expression of a fixed-size bottom-left corner of \c *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. 
/// @@ -366,24 +491,24 @@ inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type bottomLeftCorner() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); } /// This is the const version of bottomLeftCorner(). template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); } -/// \returns an expression of a bottom-left corner of *this. +/// \returns an expression of a bottom-left corner of \c *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -403,45 +528,66 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) +EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } /// This is the const version of bottomLeftCorner(Index, Index). template -inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const +EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } -/// \returns a block consisting of the top rows of *this. +/// \returns a block consisting of the top rows of \c *this. 
/// /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_topRows_int.cpp /// Output: \verbinclude MatrixBase_topRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr topRows(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +topRows(NRowsType n) { - return RowsBlockXpr(derived(), 0, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), 0, 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of topRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr topRows(Index n) const +/// This is the const version of topRows(NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +topRows(NRowsType n) const { - return ConstRowsBlockXpr(derived(), 0, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), 0, 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of the top rows of *this. +/// \returns a block consisting of the top rows of \c *this. 
/// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -454,50 +600,69 @@ inline ConstRowsBlockXpr topRows(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type topRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type topRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } /// This is the const version of topRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type topRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } -/// \returns a block consisting of the bottom rows of *this. +/// \returns a block consisting of the bottom rows of \c *this. /// /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_bottomRows_int.cpp /// Output: \verbinclude MatrixBase_bottomRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr bottomRows(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +bottomRows(NRowsType n) { - return RowsBlockXpr(derived(), rows() - n, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of bottomRows(Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr bottomRows(Index n) const +/// This is the const version of bottomRows(NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +bottomRows(NRowsType n) const { - return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of the bottom rows of *this. +/// \returns a block consisting of the bottom rows of \c *this. 
/// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -510,51 +675,70 @@ inline ConstRowsBlockXpr bottomRows(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type bottomRows(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type bottomRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } /// This is the const version of bottomRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } -/// \returns a block consisting of a range of rows of *this. +/// \returns a block consisting of a range of rows of \c *this. /// /// \param startRow the index of the first row in the block /// \param n the number of rows in the block +/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include DenseBase_middleRows_int.cpp /// Output: \verbinclude DenseBase_middleRows_int.out /// +/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline RowsBlockXpr middleRows(Index startRow, Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NRowsBlockXpr::value>::Type +#else +typename NRowsBlockXpr<...>::Type +#endif +middleRows(Index startRow, NRowsType n) { - return RowsBlockXpr(derived(), startRow, 0, n, cols()); + return typename NRowsBlockXpr::value>::Type + (derived(), startRow, 0, internal::get_runtime_value(n), cols()); } -/// This is the const version of middleRows(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const +/// This is the const version of middleRows(Index,NRowsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNRowsBlockXpr::value>::Type +#else +const typename ConstNRowsBlockXpr<...>::Type +#endif +middleRows(Index startRow, NRowsType n) const { - return ConstRowsBlockXpr(derived(), startRow, 0, n, cols()); + return typename ConstNRowsBlockXpr::value>::Type + (derived(), startRow, 0, internal::get_runtime_value(n), cols()); } -/// \returns a block consisting of a range of rows of *this. +/// \returns a block consisting of a range of rows of \c *this. 
/// /// \tparam N the number of rows in the block as specified at compile-time /// \param startRow the index of the first row in the block @@ -568,50 +752,69 @@ inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) { return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } /// This is the const version of middleRows(). template -EIGEN_DEVICE_FUNC -inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } -/// \returns a block consisting of the left columns of *this. +/// \returns a block consisting of the left columns of \c *this. /// /// \param n the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_leftCols_int.cpp /// Output: \verbinclude MatrixBase_leftCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr leftCols(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +leftCols(NColsType n) { - return ColsBlockXpr(derived(), 0, 0, rows(), n); + return typename NColsBlockXpr::value>::Type + (derived(), 0, 0, rows(), internal::get_runtime_value(n)); } -/// This is the const version of leftCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr leftCols(Index n) const +/// This is the const version of leftCols(NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +leftCols(NColsType n) const { - return ConstColsBlockXpr(derived(), 0, 0, rows(), n); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, 0, rows(), internal::get_runtime_value(n)); } -/// \returns a block consisting of the left columns of *this. +/// \returns a block consisting of the left columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -624,50 +827,69 @@ inline ConstColsBlockXpr leftCols(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type leftCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type leftCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); } /// This is the const version of leftCols(). 
template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type leftCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); } -/// \returns a block consisting of the right columns of *this. +/// \returns a block consisting of the right columns of \c *this. /// /// \param n the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_rightCols_int.cpp /// Output: \verbinclude MatrixBase_rightCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr rightCols(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +rightCols(NColsType n) { - return ColsBlockXpr(derived(), 0, cols() - n, rows(), n); + return typename NColsBlockXpr::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); } -/// This is the const version of rightCols(Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr rightCols(Index n) const +/// This is the const version of rightCols(NColsType). 
+template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +rightCols(NColsType n) const { - return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); } -/// \returns a block consisting of the right columns of *this. +/// \returns a block consisting of the right columns of \c *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -680,51 +902,70 @@ inline ConstColsBlockXpr rightCols(Index n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type rightCols(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type rightCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } /// This is the const version of rightCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type rightCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } -/// \returns a block consisting of a range of columns of *this. +/// \returns a block consisting of a range of columns of \c *this. /// /// \param startCol the index of the first column in the block /// \param numCols the number of columns in the block +/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. 
/// /// Example: \include DenseBase_middleCols_int.cpp /// Output: \verbinclude DenseBase_middleCols_int.out /// +/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC -inline ColsBlockXpr middleCols(Index startCol, Index numCols) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename NColsBlockXpr::value>::Type +#else +typename NColsBlockXpr<...>::Type +#endif +middleCols(Index startCol, NColsType numCols) { - return ColsBlockXpr(derived(), 0, startCol, rows(), numCols); + return typename NColsBlockXpr::value>::Type + (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); } -/// This is the const version of middleCols(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const +/// This is the const version of middleCols(Index,NColsType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstNColsBlockXpr::value>::Type +#else +const typename ConstNColsBlockXpr<...>::Type +#endif +middleCols(Index startCol, NColsType numCols) const { - return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols); + return typename ConstNColsBlockXpr::value>::Type + (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); } -/// \returns a block consisting of a range of columns of *this. +/// \returns a block consisting of a range of columns of \c *this. 
/// /// \tparam N the number of columns in the block as specified at compile-time /// \param startCol the index of the first column in the block @@ -738,26 +979,26 @@ inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } /// This is the const version of middleCols(). template -EIGEN_DEVICE_FUNC -inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } -/// \returns a fixed-size expression of a block in *this. +/// \returns a fixed-size expression of a block of \c *this. /// /// The template parameters \a NRows and \a NCols are the number of /// rows and columns in the block. 
@@ -768,29 +1009,35 @@ inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = /// Example: \include MatrixBase_block_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int.out /// +/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic +/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence: +/// \code +/// mat.template block(i,j) <--> mat.block(i,j,fix,fix) +/// \endcode +/// /// \note since block is a templated member, the keyword template has to be used /// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol) { return typename FixedBlockXpr::Type(derived(), startRow, startCol); } /// This is the const version of block<>(Index, Index). */ template -EIGEN_DEVICE_FUNC -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); } -/// \returns an expression of a block in *this. +/// \returns an expression of a block of \c *this. /// /// \tparam NRows number of rows in block as specified at compile-time /// \tparam NCols number of columns in block as specified at compile-time @@ -805,14 +1052,25 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow /// \a NRows is \a Dynamic, and the same for the number of columns. 
/// /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp +/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out +/// +/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic +/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence: +/// \code +/// mat.template block(i,j,rows,cols) <--> mat.block(i,j,fix(rows),fix(cols)) +/// \endcode +/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes: +/// \code +/// mat.template block(i,j,rows,NCols) <--> mat.block(i,j,rows,fix) +/// \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, block(Index,Index,Index,Index) +/// \sa block(Index,Index,NRowsType,NColsType), class Block /// template -inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); @@ -820,13 +1078,14 @@ inline typename FixedBlockXpr::Type block(Index startRow, Index sta /// This is the const version of block<>(Index, Index, Index, Index). template -inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); } -/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0. 
/// /// Example: \include MatrixBase_col.cpp /// Output: \verbinclude MatrixBase_col.out @@ -834,20 +1093,20 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /** * \sa row(), class Block */ -EIGEN_DEVICE_FUNC -inline ColXpr col(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ColXpr col(Index i) { return ColXpr(derived(), i); } /// This is the const version of col(). -EIGEN_DEVICE_FUNC -inline ConstColXpr col(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); } -/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0. /// /// Example: \include MatrixBase_row.cpp /// Output: \verbinclude MatrixBase_row.out @@ -855,109 +1114,166 @@ inline ConstColXpr col(Index i) const EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /** * \sa col(), class Block */ -EIGEN_DEVICE_FUNC -inline RowXpr row(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +RowXpr row(Index i) { return RowXpr(derived(), i); } /// This is the const version of row(). */ -EIGEN_DEVICE_FUNC -inline ConstRowXpr row(Index i) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); } -/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this. +/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param start the first coefficient in the segment /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. 
/// /// Example: \include MatrixBase_segment_int_int.cpp /// Output: \verbinclude MatrixBase_segment_int_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// -/// \sa class Block, segment(Index) +/// \sa block(Index,Index,NRowsType,NColsType), fix, fix(int), class Block /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType segment(Index start, Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +segment(Index start, NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), start, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), start, internal::get_runtime_value(n)); } -/// This is the const version of segment(Index,Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType segment(Index start, Index n) const +/// This is the const version of segment(Index,NType). 
+template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +segment(Index start, NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), start, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), start, internal::get_runtime_value(n)); } -/// \returns a dynamic-size expression of the first coefficients of *this. +/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_start_int.cpp /// Output: \verbinclude MatrixBase_start_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. +/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. 
/// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType head(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +head(NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), 0, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), 0, internal::get_runtime_value(n)); } -/// This is the const version of head(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType head(Index n) const +/// This is the const version of head(NType). +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +head(NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), 0, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), 0, internal::get_runtime_value(n)); } -/// \returns a dynamic-size expression of the last coefficients of *this. +/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment +/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_end_int.cpp /// Output: \verbinclude MatrixBase_end_int.out /// -/// \note Even though the returned expression has dynamic size, in the case +/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. +/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. 
+/// +/// \note Even in the case that the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC -inline SegmentReturnType tail(Index n) +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +typename FixedSegmentReturnType::value>::Type +#else +typename FixedSegmentReturnType<...>::Type +#endif +tail(NType n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return SegmentReturnType(derived(), this->size() - n, n); + return typename FixedSegmentReturnType::value>::Type + (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); } /// This is the const version of tail(Index). -EIGEN_DEVICE_FUNC -inline ConstSegmentReturnType tail(Index n) const +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +#ifndef EIGEN_PARSED_BY_DOXYGEN +const typename ConstFixedSegmentReturnType::value>::Type +#else +const typename ConstFixedSegmentReturnType<...>::Type +#endif +tail(NType n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return ConstSegmentReturnType(derived(), this->size() - n, n); + return typename ConstFixedSegmentReturnType::value>::Type + (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); } /// \returns a fixed-size expression of a segment (i.e. 
a vector block) in \c *this @@ -974,11 +1290,11 @@ inline ConstSegmentReturnType tail(Index n) const /// Example: \include MatrixBase_template_int_segment.cpp /// Output: \verbinclude MatrixBase_template_int_segment.out /// -/// \sa class Block +/// \sa segment(Index,NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type segment(Index start, Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), start, n); @@ -986,14 +1302,14 @@ inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N /// This is the const version of segment(Index). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), start, n); } -/// \returns a fixed-size expression of the first coefficients of *this. +/// \returns a fixed-size expression of the first coefficients of \c *this. 
/// /// \only_for_vectors /// @@ -1006,11 +1322,11 @@ inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index /// Example: \include MatrixBase_template_int_start.cpp /// Output: \verbinclude MatrixBase_template_int_start.out /// -/// \sa class Block +/// \sa head(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type head(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type head(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), 0, n); @@ -1018,14 +1334,14 @@ inline typename FixedSegmentReturnType::Type head(Index n = N) /// This is the const version of head(). template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type head(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); } -/// \returns a fixed-size expression of the last coefficients of *this. +/// \returns a fixed-size expression of the last coefficients of \c *this. /// /// \only_for_vectors /// @@ -1038,11 +1354,11 @@ inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const /// Example: \include MatrixBase_template_int_end.cpp /// Output: \verbinclude MatrixBase_template_int_end.out /// -/// \sa class Block +/// \sa tail(NType), class Block /// template -EIGEN_DEVICE_FUNC -inline typename FixedSegmentReturnType::Type tail(Index n = N) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename FixedSegmentReturnType::Type tail(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), size() - n); @@ -1050,9 +1366,77 @@ inline typename FixedSegmentReturnType::Type tail(Index n = N) /// This is the const version of tail. 
template -EIGEN_DEVICE_FUNC -inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename ConstFixedSegmentReturnType::Type tail(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +InnerVectorReturnType innerVector(Index outer) +{ return InnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const ConstInnerVectorReturnType innerVector(Index outer) const +{ return ConstInnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +InnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const ConstInnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) const +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? 
cols() : outerSize); + +} + +/** \returns the i-th subvector (column or vector) according to the \c Direction + * \sa subVectors() + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename internal::conditional::type +subVector(Index i) +{ + return typename internal::conditional::type(derived(),i); +} + +/** This is the const version of subVector(Index) */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +typename internal::conditional::type +subVector(Index i) const +{ + return typename internal::conditional::type(derived(),i); +} + +/** \returns the number of subvectors (rows or columns) in the direction \c Direction + * \sa subVector(Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR +Index subVectors() const +{ return (Direction==Vertical)?cols():rows(); } diff --git a/externals/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h b/externals/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h index 89f4faaa..5418dc41 100644 --- a/externals/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/externals/eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -76,6 +76,20 @@ conjugate() const return ConjugateReturnType(derived()); } +/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise. +/// +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +/// +/// \sa conjugate() +template +EIGEN_DEVICE_FUNC +inline typename internal::conditional::type +conjugateIf() const +{ + typedef typename internal::conditional::type ReturnType; + return ReturnType(derived()); +} + /// \returns a read-only expression of the real part of \c *this. 
/// EIGEN_DOC_UNARY_ADDONS(real,real part function) diff --git a/externals/eigen/Eigen/src/plugins/IndexedViewMethods.h b/externals/eigen/Eigen/src/plugins/IndexedViewMethods.h new file mode 100644 index 00000000..5bfb19ac --- /dev/null +++ b/externals/eigen/Eigen/src/plugins/IndexedViewMethods.h @@ -0,0 +1,262 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if !defined(EIGEN_PARSED_BY_DOXYGEN) + +// This file is automatically included twice to generate const and non-const versions + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#define EIGEN_INDEXED_VIEW_METHOD_CONST const +#define EIGEN_INDEXED_VIEW_METHOD_TYPE ConstIndexedViewType +#else +#define EIGEN_INDEXED_VIEW_METHOD_CONST +#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType +#endif + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +protected: + +// define some aliases to ease readability + +template +struct IvcRowType : public internal::IndexedViewCompatibleType {}; + +template +struct IvcColType : public internal::IndexedViewCompatibleType {}; + +template +struct IvcType : public internal::IndexedViewCompatibleType {}; + +typedef typename internal::IndexedViewCompatibleType::type IvcIndex; + +template +typename IvcRowType::type +ivcRow(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().rows()),Specialized); +} + +template +typename IvcColType::type +ivcCol(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().cols()),Specialized); +} + +template +typename IvcColType::type +ivcSize(const Indices& indices) const { + return internal::makeIndexedViewCompatible(indices, 
internal::variable_if_dynamic(derived().size()),Specialized); +} + +public: + +#endif + +template +struct EIGEN_INDEXED_VIEW_METHOD_TYPE { + typedef IndexedView::type, + typename IvcColType::type> type; +}; + +// This is the generic version + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsIndexedView, + typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type + (derived(), ivcRow(rowIndices), ivcCol(colIndices)); +} + +// The following overload returns a Block<> object + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsBlock, + typename internal::traits::type>::BlockType>::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + typedef typename internal::traits::type>::BlockType BlockType; + typename IvcRowType::type actualRowIndices = ivcRow(rowIndices); + typename IvcColType::type actualColIndices = ivcCol(colIndices); + return BlockType(derived(), + internal::first(actualRowIndices), + internal::first(actualColIndices), + internal::size(actualRowIndices), + internal::size(actualColIndices)); +} + +// The following overload returns a Scalar + +template +typename internal::enable_if::value + && internal::traits::type>::ReturnAsScalar, + CoeffReturnType >::type +operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); +} + +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +// The following three overloads are needed to handle raw Index[N] arrays. 
+ +template +IndexedView::type> +operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView::type> + (derived(), rowIndices, ivcCol(colIndices)); +} + +template +IndexedView::type, const ColIndicesT (&)[ColIndicesN]> +operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView::type,const ColIndicesT (&)[ColIndicesN]> + (derived(), ivcRow(rowIndices), colIndices); +} + +template +IndexedView +operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return IndexedView + (derived(), rowIndices, colIndices); +} + +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +// Overloads for 1D vectors/arrays + +template +typename internal::enable_if< + IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), + IndexedView::type> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView::type> + (derived(), IvcIndex(0), ivcCol(indices)); +} + +template +typename internal::enable_if< + (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), + IndexedView::type,IvcIndex> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView::type,IvcIndex> + (derived(), ivcRow(indices), IvcIndex(0)); +} + +template +typename internal::enable_if< + (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!symbolic::is_symbolic::value), + VectorBlock::value> >::type +operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + typename IvcType::type actualIndices = ivcSize(indices); + 
return VectorBlock::value> + (derived(), internal::first(actualIndices), internal::size(actualIndices)); +} + +template +typename internal::enable_if::value, CoeffReturnType >::type +operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + return Base::operator()(internal::eval_expr_given_size(id,size())); +} + +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +template +typename internal::enable_if >::type +operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView + (derived(), IvcIndex(0), indices); +} + +template +typename internal::enable_if >::type +operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return IndexedView + (derived(), indices, IvcIndex(0)); +} + +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + +#undef EIGEN_INDEXED_VIEW_METHOD_CONST +#undef EIGEN_INDEXED_VIEW_METHOD_TYPE + +#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#include "IndexedViewMethods.h" +#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS +#endif + +#else // EIGEN_PARSED_BY_DOXYGEN + +/** + * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively. + * + * Each parameter must either be: + * - An integer indexing a single row or column + * - Eigen::all indexing the full set of respective rows or columns in increasing order + * - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions + * - Any %Eigen's vector/array of integers or expressions + * - Plain C arrays: \c int[N] + * - And more generally any type exposing the following two member functions: + * \code + * operator[]() const; + * size() const; + * \endcode + * where \c stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t). 
+ * + * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc. + * + * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this + * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case + * when all arguments are either: + * - An integer + * - Eigen::all + * - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N). + * + * Otherwise a more general IndexedView object will be returned, after conversion of the inputs + * to more suitable types \c RowIndices' and \c ColIndices'. + * + * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. + * + * See also this question and its answer for an example of how to duplicate coefficients. + * + * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) + */ +template +IndexedView_or_Block +operator()(const RowIndices& rowIndices, const ColIndices& colIndices); + +/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays + * + * \only_for_vectors + */ +template +IndexedView_or_VectorBlock +operator()(const Indices& indices); + +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/externals/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/externals/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h index f1084abe..a0feef87 100644 --- a/externals/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/externals/eigen/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const 
OtherDerived> cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise != operator of *this and \a other @@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and \a other diff --git a/externals/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/externals/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h index b1be3d56..0514d8f7 100644 --- a/externals/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h +++ b/externals/eigen/Eigen/src/plugins/MatrixCwiseUnaryOps.h @@ -14,6 +14,7 @@ typedef CwiseUnaryOp, const Derived> CwiseAbsReturnType; typedef CwiseUnaryOp, const Derived> CwiseAbs2ReturnType; +typedef CwiseUnaryOp, const Derived> CwiseArgReturnType; typedef CwiseUnaryOp, const Derived> CwiseSqrtReturnType; typedef CwiseUnaryOp, const Derived> CwiseSignReturnType; typedef CwiseUnaryOp, const Derived> CwiseInverseReturnType; @@ -82,4 +83,13 @@ EIGEN_DEVICE_FUNC inline const CwiseInverseReturnType cwiseInverse() const { return CwiseInverseReturnType(derived()); } +/// \returns an expression of the coefficient-wise phase angle of \c *this +/// +/// Example: \include MatrixBase_cwiseArg.cpp +/// Output: \verbinclude MatrixBase_cwiseArg.out +/// +EIGEN_DOC_UNARY_ADDONS(cwiseArg,arg) +EIGEN_DEVICE_FUNC +inline const CwiseArgReturnType +cwiseArg() const { return 
CwiseArgReturnType(derived()); } diff --git a/externals/eigen/Eigen/src/plugins/ReshapedMethods.h b/externals/eigen/Eigen/src/plugins/ReshapedMethods.h new file mode 100644 index 00000000..482a6b04 --- /dev/null +++ b/externals/eigen/Eigen/src/plugins/ReshapedMethods.h @@ -0,0 +1,149 @@ + +#ifdef EIGEN_PARSED_BY_DOXYGEN + +/// \returns an expression of \c *this with reshaped sizes. +/// +/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize +/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize +/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), +/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. +/// \tparam NRowsType the type of the value handling the number of rows, typically Index. +/// \tparam NColsType the type of the value handling the number of columns, typically Index. +/// +/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp +/// Output: \verbinclude MatrixBase_reshaped_int_int.out +/// +/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix, +/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. 
+/// Here is an example with a fixed number of rows and columns: +/// \include MatrixBase_reshaped_fixed.cpp +/// Output: \verbinclude MatrixBase_reshaped_fixed.out +/// +/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example: +/// \include MatrixBase_reshaped_auto.cpp +/// Output: \verbinclude MatrixBase_reshaped_auto.out +/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and +/// that the other size is passed at compile-time using Eigen::fix as above. +/// +/// \sa class Reshaped, fix, fix(int) +/// +template +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped(NRowsType nRows, NColsType nCols); + +/// This is the const version of reshaped(NRowsType,NColsType). +template +EIGEN_DEVICE_FUNC +inline const Reshaped +reshaped(NRowsType nRows, NColsType nCols) const; + +/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector +/// +/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), +/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. +/// +/// This overloads is essentially a shortcut for `A.reshaped(AutoSize,fix<1>)`. +/// +/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this. +/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this. +/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this. +/// This mode is the recommended one when the particular ordering of the element is not relevant. 
+/// +/// Example: +/// \include MatrixBase_reshaped_to_vector.cpp +/// Output: \verbinclude MatrixBase_reshaped_to_vector.out +/// +/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType). +/// +/// \sa reshaped(NRowsType,NColsType), class Reshaped +/// +template +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped(); + +/// This is the const version of reshaped(). +template +EIGEN_DEVICE_FUNC +inline const Reshaped +reshaped() const; + +#else + +// This file is automatically included twice to generate const and non-const versions + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS +#define EIGEN_RESHAPED_METHOD_CONST const +#else +#define EIGEN_RESHAPED_METHOD_CONST +#endif + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS + +// This part is included once + +#endif + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value, + internal::get_compiletime_reshape_size::value> +reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped::value, + internal::get_compiletime_reshape_size::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +} + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value, + internal::get_compiletime_reshape_size::value, + internal::get_compiletime_reshape_order::value> +reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped::value, + internal::get_compiletime_reshape_size::value, + internal::get_compiletime_reshape_order::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +} + +// Views as linear vectors + +EIGEN_DEVICE_FUNC +inline Reshaped +reshaped() EIGEN_RESHAPED_METHOD_CONST +{ + return Reshaped(derived(),size(),1); +} + +template +EIGEN_DEVICE_FUNC +inline Reshaped::value> +reshaped() 
EIGEN_RESHAPED_METHOD_CONST +{ + EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER); + return Reshaped::value> + (derived(), size(), 1); +} + +#undef EIGEN_RESHAPED_METHOD_CONST + +#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS +#define EIGEN_RESHAPED_METHOD_2ND_PASS +#include "ReshapedMethods.h" +#undef EIGEN_RESHAPED_METHOD_2ND_PASS +#endif + +#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/externals/eigen/README.md b/externals/eigen/README.md index 4654a81c..9b40e9ed 100644 --- a/externals/eigen/README.md +++ b/externals/eigen/README.md @@ -1,3 +1,5 @@ **Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.** For more information go to http://eigen.tuxfamily.org/. + +For ***pull request***, ***bug reports***, and ***feature requests***, go to https://gitlab.com/libeigen/eigen. diff --git a/externals/eigen/cmake/ComputeCppCompilerChecks.cmake b/externals/eigen/cmake/ComputeCppCompilerChecks.cmake new file mode 100644 index 00000000..1807485e --- /dev/null +++ b/externals/eigen/cmake/ComputeCppCompilerChecks.cmake @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.4.3) + +if(CMAKE_COMPILER_IS_GNUCXX) + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) + message(FATAL_ERROR "host compiler - gcc version must be > 4.8") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) + message(FATAL_ERROR "host compiler - clang version must be > 3.6") + endif() +endif() + +if(MSVC) + set(ComputeCpp_STL_CHECK_SRC __STL_check) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp + "#include \n" + "int main() { return 0; }\n") + execute_process( + COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + -isystem ${ComputeCpp_INCLUDE_DIRS} + -o ${ComputeCpp_STL_CHECK_SRC}.sycl + -c ${ComputeCpp_STL_CHECK_SRC}.cpp + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + 
RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT + ERROR_QUIET + OUTPUT_QUIET) + if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0) + # Try disabling compiler version checks + execute_process( + COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH + -isystem ${ComputeCpp_INCLUDE_DIRS} + -o ${ComputeCpp_STL_CHECK_SRC}.cpp.sycl + -c ${ComputeCpp_STL_CHECK_SRC}.cpp + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT + ERROR_QUIET + OUTPUT_QUIET) + if(NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0) + message(STATUS "Device compiler cannot consume hosted STL headers. Using any parts of the STL will likely result in device compiler errors.") + else() + message(STATUS "Device compiler does not meet certain STL version requirements. Disabling version checks and hoping for the best.") + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH) + endif() + endif() + file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp + ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp.sycl) +endif(MSVC) diff --git a/externals/eigen/cmake/ComputeCppIRMap.cmake b/externals/eigen/cmake/ComputeCppIRMap.cmake new file mode 100644 index 00000000..942d91d6 --- /dev/null +++ b/externals/eigen/cmake/ComputeCppIRMap.cmake @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.4.3) + +# These should match the types of IR output by compute++ +set(IR_MAP_spir bc) +set(IR_MAP_spir64 bc) +set(IR_MAP_spir32 bc) +set(IR_MAP_spirv spv) +set(IR_MAP_spirv64 spv) +set(IR_MAP_spirv32 spv) +set(IR_MAP_aorta-x86_64 o) +set(IR_MAP_aorta-aarch64 o) +set(IR_MAP_aorta-rcar-cve o) +set(IR_MAP_custom-spir64 bc) +set(IR_MAP_custom-spir32 bc) +set(IR_MAP_custom-spirv64 spv) +set(IR_MAP_custom-spirv32 spv) +set(IR_MAP_ptx64 s) +set(IR_MAP_amdgcn s) diff --git a/externals/eigen/cmake/Eigen3Config.cmake.in b/externals/eigen/cmake/Eigen3Config.cmake.in new file 
mode 100644 index 00000000..0a1ac61c --- /dev/null +++ b/externals/eigen/cmake/Eigen3Config.cmake.in @@ -0,0 +1,23 @@ +# This file exports the Eigen3::Eigen CMake target which should be passed to the +# target_link_libraries command. + +@PACKAGE_INIT@ + +if (NOT TARGET eigen) + include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake") +endif () + +# Legacy variables, do *not* use. May be removed in the future. + +set (EIGEN3_FOUND 1) +set (EIGEN3_USE_FILE "${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake") + +set (EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@") +set (EIGEN3_INCLUDE_DIR "@PACKAGE_EIGEN_INCLUDE_DIR@") +set (EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@") +set (EIGEN3_ROOT_DIR "@PACKAGE_EIGEN_ROOT_DIR@") + +set (EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@") +set (EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@") +set (EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@") +set (EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@") diff --git a/externals/eigen/cmake/Eigen3ConfigLegacy.cmake.in b/externals/eigen/cmake/Eigen3ConfigLegacy.cmake.in new file mode 100644 index 00000000..62d72246 --- /dev/null +++ b/externals/eigen/cmake/Eigen3ConfigLegacy.cmake.in @@ -0,0 +1,30 @@ +# -*- cmake -*- +# +# Eigen3Config.cmake(.in) + +# Use the following variables to compile and link against Eigen: +# EIGEN3_FOUND - True if Eigen was found on your system +# EIGEN3_USE_FILE - The file making Eigen usable +# EIGEN3_DEFINITIONS - Definitions needed to build with Eigen +# EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found +# EIGEN3_INCLUDE_DIRS - List of directories of Eigen and it's dependencies +# EIGEN3_ROOT_DIR - The base directory of Eigen +# EIGEN3_VERSION_STRING - A human-readable string containing the version +# EIGEN3_VERSION_MAJOR - The major version of Eigen +# EIGEN3_VERSION_MINOR - The minor version of Eigen +# EIGEN3_VERSION_PATCH - The patch version of Eigen + +@PACKAGE_INIT@ + +set ( EIGEN3_FOUND 1 ) +set ( EIGEN3_USE_FILE 
"${CMAKE_CURRENT_LIST_DIR}/UseEigen3.cmake" ) + +set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" ) +set ( EIGEN3_INCLUDE_DIR "@PACKAGE_EIGEN_INCLUDE_DIR@" ) +set ( EIGEN3_INCLUDE_DIRS "@PACKAGE_EIGEN_INCLUDE_DIR@" ) +set ( EIGEN3_ROOT_DIR "@PACKAGE_EIGEN_ROOT_DIR@" ) + +set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" ) +set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" ) +set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" ) +set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" ) diff --git a/externals/eigen/cmake/EigenConfigureTesting.cmake b/externals/eigen/cmake/EigenConfigureTesting.cmake new file mode 100644 index 00000000..9cb3bb20 --- /dev/null +++ b/externals/eigen/cmake/EigenConfigureTesting.cmake @@ -0,0 +1,58 @@ +include(EigenTesting) +include(CheckCXXSourceCompiles) + +# configure the "site" and "buildname" +ei_set_sitename() + +# retrieve and store the build string +ei_set_build_string() + +add_custom_target(buildtests) +add_custom_target(check COMMAND "ctest") +add_dependencies(check buildtests) + +# check whether /bin/bash exists (disabled as not used anymore) +# find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) + +# This call activates testing and generates the DartConfiguration.tcl +include(CTest) + +set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests") +set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built in dashboard mode, default is buildtests") +set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out") + +# Overwrite default DartConfiguration.tcl such that ctest can build our unit tests. +# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target. +# At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable. 
+file(READ "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE) +# try to grab the default flags +string(REGEX MATCH "MakeCommand:.*-- (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE}) +if(NOT CMAKE_MATCH_1) +string(REGEX MATCH "MakeCommand:.*[^c]make (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE}) +endif() +string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . --target ${EIGEN_DASHBOARD_BUILD_TARGET} --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType" + EIGEN_DART_CONFIG_FILE2 ${EIGEN_DART_CONFIG_FILE}) +file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" ${EIGEN_DART_CONFIG_FILE2}) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CTestCustom.cmake.in ${CMAKE_BINARY_DIR}/CTestCustom.cmake) + +# some documentation of this function would be nice +ei_init_testing() + +# configure Eigen related testing options +option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF) +option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF) + +if(CMAKE_COMPILER_IS_GNUCXX) + option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF) + if(EIGEN_COVERAGE_TESTING) + set(COVERAGE_FLAGS "-fprofile-arcs -ftest-coverage") + set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/test/") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_FLAGS}") + endif() + +elseif(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS") +endif() + + diff --git a/externals/eigen/cmake/EigenDetermineOSVersion.cmake b/externals/eigen/cmake/EigenDetermineOSVersion.cmake new file mode 100644 index 00000000..9246fa67 --- /dev/null +++ b/externals/eigen/cmake/EigenDetermineOSVersion.cmake @@ -0,0 +1,46 @@ +# The utility function DetermineOSVersion aims at providing an +# improved version of the CMake variable ${CMAKE_SYSTEM} on Windows +# 
machines. +# +# Usage: +# include(EigenDetermineOSVersion) +# DetermineOSVersion(OS_VERSION) +# message("OS: ${OS_VERSION}") + +# - A little helper variable which should not be directly called +function(DetermineShortWindowsName WIN_VERSION win_num_version) + if (${win_num_version} VERSION_EQUAL "6.1") + set(_version "win7") + elseif(${win_num_version} VERSION_EQUAL "6.0") + set(_version "winVista") + elseif(${win_num_version} VERSION_EQUAL "5.2") + set(_version "winXpProf") + elseif(${win_num_version} VERSION_EQUAL "5.1") + set(_version "winXp") + elseif(${win_num_version} VERSION_EQUAL "5.0") + set(_version "win2000Prof") + else() + set(_version "unknownWin") + endif() + set(${WIN_VERSION} ${_version} PARENT_SCOPE) +endfunction() + +function(DetermineOSVersion OS_VERSION) + if (WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) + file (TO_NATIVE_PATH "$ENV{COMSPEC}" SHELL) + exec_program( ${SHELL} ARGS "/c" "ver" OUTPUT_VARIABLE ver_output) + + string(REGEX MATCHALL "[0-9]+" + ver_list "${ver_output}") + list(GET ver_list 0 _major) + list(GET ver_list 1 _minor) + + set(win_num_version ${_major}.${_minor}) + DetermineShortWindowsName(win_version "${win_num_version}") + if(win_version) + set(${OS_VERSION} ${win_version} PARENT_SCOPE) + endif() + else() + set(${OS_VERSION} ${CMAKE_SYSTEM} PARENT_SCOPE) + endif() +endfunction() diff --git a/externals/eigen/cmake/EigenDetermineVSServicePack.cmake b/externals/eigen/cmake/EigenDetermineVSServicePack.cmake new file mode 100644 index 00000000..fed78194 --- /dev/null +++ b/externals/eigen/cmake/EigenDetermineVSServicePack.cmake @@ -0,0 +1,41 @@ +include(CMakeDetermineVSServicePack) + +# The code is almost identical to the CMake version. The only difference is that we remove +# _DetermineVSServicePack_FastCheckVersionWithCompiler which lead to errors on some systems. 
+function(EigenDetermineVSServicePack _pack) + if(NOT DETERMINED_VS_SERVICE_PACK OR NOT ${_pack}) + if(NOT DETERMINED_VS_SERVICE_PACK) + _DetermineVSServicePack_CheckVersionWithTryCompile(DETERMINED_VS_SERVICE_PACK _cl_version) + if(NOT DETERMINED_VS_SERVICE_PACK) + _DetermineVSServicePack_CheckVersionWithTryRun(DETERMINED_VS_SERVICE_PACK _cl_version) + endif() + endif() + + if(DETERMINED_VS_SERVICE_PACK) + if(_cl_version) + # Call helper function to determine VS version + _DetermineVSServicePackFromCompiler(_sp "${_cl_version}") + + # temporary fix, until CMake catches up + if (NOT _sp) + if(${_cl_version} VERSION_EQUAL "17.00.50727.1") + set(_sp "vc110") + elseif(${_cl_version} VERSION_EQUAL "17.00.51106.1") + set(_sp "vc110sp1") + elseif(${_cl_version} VERSION_EQUAL "17.00.60315.1") + set(_sp "vc110sp2") + elseif(${_cl_version} VERSION_EQUAL "17.00.60610.1") + set(_sp "vc110sp3") + else() + set(_sp ${CMAKE_CXX_COMPILER_VERSION}) + endif() + endif() + + if(_sp) + set(${_pack} ${_sp} CACHE INTERNAL + "The Visual Studio Release with Service Pack") + endif() + endif() + endif() + endif() +endfunction() diff --git a/externals/eigen/cmake/EigenSmokeTestList.cmake b/externals/eigen/cmake/EigenSmokeTestList.cmake new file mode 100644 index 00000000..6f0f7241 --- /dev/null +++ b/externals/eigen/cmake/EigenSmokeTestList.cmake @@ -0,0 +1,131 @@ +# List of tests that will be build and run during Eigen's smoke testing. If one +# of these tests doesn't exists or cannot be build with the current configuration +# it will just be skipped. 
+set(ei_smoke_test_list + adjoint_1 + alignedvector3 + array_cwise_7 + array_cwise_8 + array_for_matrix_1 + array_of_string + array_replicate_1 + array_reverse_1 + autodiff_1 + autodiff_scalar_1 + bandmatrix + bdcsvd_9 + bessel_functions_1 + bfloat16_float + blasutil_1 + block_5 + BVH + cholesky_1 + cholmod_support_23 + cholmod_support_24 + conservative_resize_1 + constructor_1 + corners_1 + ctorleakmiscmatrices_4 + dense_storage + determinant_1 + diagonal_1 + diagonal_2 + diagonalmatrices_1 + dynalloc + eigensolver_complex_1 + eigensolver_selfadjoint_8 + EulerAngles_1 + exceptions + fastmath + first_aligned + geo_alignedbox_2 + geo_eulerangles_1 + geo_homogeneous_1 + geo_hyperplane_1 + geo_orthomethods_1 + geo_parametrizedline_1 + geo_transformations_7 + half_float + hessenberg_1 + hessenberg_6qr_10 + householder_8 + indexed_view_1 + inplace_decomposition_1 + integer_types_1 + inverse_1 + is_same_dense + jacobi_1 + jacobisvd_1 + kronecker_product + linearstructure_1 + mapped_matrix_1 + mapstaticmethods_1 + mapstride_1 + matrix_square_root_1 + meta + minres_2 + miscmatrices_1 + mixingtypes_7 + nestbyvalue + nesting_ops_1 + nomalloc_1 + nullary_1 + num_dimensions + NumericalDiff + numext + packetmath + permutationmatrices_1 + polynomialsolver_1 + prec_inverse_4x4_1 + product_extra_5 + product_selfadjoint_1 + product_small_7 + product_symm_1 + product_syrk_1 + product_trmm_1 + product_trmv_1 + product_trsolve_5 + qr_1 + qr_colpivoting_7 + qr_fullpivoting_4 + rand + real_qz_1 + redux_1 + ref_1 + resize + rvalue_types_1 + schur_complex_1 + schur_real_1 + selfadjoint_1 + sizeof + sizeoverflow + smallvectors + sparse_basic_3 + sparse_block_1 + sparse_extra_4 + sparse_permutations_2 + sparse_product_4 + sparse_ref_1 + sparse_solvers_1 + sparse_vector_1 + special_functions_1 + special_numbers_1 + special_packetmath_1 + spqr_support_2 + stable_norm_1 + stddeque_1 + stddeque_overload_1 + stdlist_1 + stdlist_overload_1 + stdvector_1 + stdvector_overload_1 + stl_iterators_1 + 
swap_1 + symbolic_index_1 + triangular_1 + type_aliaslu_9 + umeyama_3 + unalignedassert + unalignedcount + vectorwiseop_1 + visitor_1) \ No newline at end of file diff --git a/externals/eigen/cmake/EigenTesting.cmake b/externals/eigen/cmake/EigenTesting.cmake new file mode 100644 index 00000000..eb8457db --- /dev/null +++ b/externals/eigen/cmake/EigenTesting.cmake @@ -0,0 +1,782 @@ + +macro(ei_add_property prop value) + get_property(previous GLOBAL PROPERTY ${prop}) + if ((NOT previous) OR (previous STREQUAL "")) + set_property(GLOBAL PROPERTY ${prop} "${value}") + else() + set_property(GLOBAL PROPERTY ${prop} "${previous} ${value}") + endif() +endmacro() + +#internal. See documentation of ei_add_test for details. +macro(ei_add_test_internal testname testname_with_suffix) + set(targetname ${testname_with_suffix}) + + if(EIGEN_ADD_TEST_FILENAME_EXTENSION) + set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) + else() + set(filename ${testname}.cpp) + endif() + + # Add the current target to the list of subtest targets + get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST) + set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n") + set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + + if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) + if(EIGEN_TEST_HIP) + hip_reset_flags() + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") + elseif(EIGEN_TEST_CUDA_CLANG) + set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) + + if(CUDA_64_BIT_DEVICE_CODE AND (EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")) + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") + else() + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib") + endif() + + if (${ARGC} GREATER 2) + add_executable(${targetname} ${filename}) + else() + add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + endif() + set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread") + if (CMAKE_SYSTEM_NAME STREQUAL 
"Linux") + set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt") + endif() + target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES}) + else() + if (${ARGC} GREATER 2) + cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) + else() + cuda_add_executable(${targetname} ${filename}) + endif() + endif() + else() + add_executable(${targetname} ${filename}) + endif() + + if (targetname MATCHES "^eigen2_") + add_dependencies(eigen2_buildtests ${targetname}) + else() + add_dependencies(buildtests ${targetname}) + endif() + + if(EIGEN_NO_ASSERTION_CHECKING) + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1") + else() + if(EIGEN_DEBUG_ASSERTS) + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1") + endif() + endif() + + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") + + if(MSVC) + ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") + endif() + + # let the user pass flags. + if(${ARGC} GREATER 2) + ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}") + endif() + + if(EIGEN_TEST_CUSTOM_CXX_FLAGS) + ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + endif() + + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() + if(EXTERNAL_LIBS) + target_link_libraries(${targetname} ${EXTERNAL_LIBS}) + endif() + if(EIGEN_TEST_CUSTOM_LINKER_FLAGS) + target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS}) + endif() + + if(${ARGC} GREATER 3) + set(libs_to_link ${ARGV3}) + # it could be that some cmake module provides a bad library string " " (just spaces), + # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors). + # so we check for strings containing only spaces. 
+ string(STRIP "${libs_to_link}" libs_to_link_stripped) + string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length) + if(${libs_to_link_stripped_length} GREATER 0) + # notice: no double quotes around ${libs_to_link} here. It may be a list. + target_link_libraries(${targetname} ${libs_to_link}) + endif() + endif() + + add_test(${testname_with_suffix} "${targetname}") + + # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT + get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) + if ((current_subproject) AND (NOT (current_subproject STREQUAL ""))) + set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}") + add_dependencies("Build${current_subproject}" ${targetname}) + set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") + endif() + if(EIGEN_SYCL) + # Force include of the SYCL file at the end to avoid errors. + set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) + # Set COMPILE_FLAGS to COMPILE_DEFINITIONS instead to avoid having to duplicate the flags + # to the device compiler. 
+ get_target_property(target_compile_flags ${targetname} COMPILE_FLAGS) + separate_arguments(target_compile_flags) + foreach(flag ${target_compile_flags}) + if(${flag} MATCHES "^-D.*") + string(REPLACE "-D" "" definition_flag ${flag}) + set_property(TARGET ${targetname} APPEND PROPERTY COMPILE_DEFINITIONS ${definition_flag}) + list(REMOVE_ITEM target_compile_flags ${flag}) + endif() + endforeach() + set_property(TARGET ${targetname} PROPERTY COMPILE_FLAGS ${target_compile_flags}) + # Link against pthread and add sycl to target + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${targetname} Threads::Threads) + add_sycl_to_target(TARGET ${targetname} SOURCES ${filename}) + endif(EIGEN_SYCL) +endmacro(ei_add_test_internal) +# Macro to add a test +# +# the unique mandatory parameter testname must correspond to a file +# .cpp which follows this pattern: +# +# #include "main.h" +# void test_() { ... } +# +# Depending on the contents of that file, this macro can have 2 behaviors, +# see below. +# +# The optional 2nd parameter is libraries to link to. +# +# A. Default behavior +# +# this macro adds an executable as well as a ctest test +# named too. +# +# On platforms with bash simply run: +# "ctest -V" or "ctest -V -R " +# On other platform use ctest as usual +# +# B. Multi-part behavior +# +# If the source file matches the regexp +# CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+ +# then it is interpreted as a multi-part test. The behavior then depends on the +# CMake option EIGEN_SPLIT_LARGE_TESTS, which is ON by default. +# +# If EIGEN_SPLIT_LARGE_TESTS is OFF, the behavior is the same as in A (the multi-part +# aspect is ignored). +# +# If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables +# test__ +# where N runs from 1 to the greatest occurrence found in the source file. Each of these +# executables is built passing -DEIGEN_TEST_PART_N. This allows to split large tests +# into smaller executables. 
+# +# Moreover, targets are still generated, they +# have the effect of building all the parts of the test. +# +# Again, ctest -R allows to run all matching tests. +macro(ei_add_test testname) + get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST) + set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n") + set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") + + if(EIGEN_ADD_TEST_FILENAME_EXTENSION) + set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) + else() + set(filename ${testname}.cpp) + endif() + + file(READ "${filename}" test_source) + string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") + list(REMOVE_DUPLICATES suffixes) + set(explicit_suffixes "") + if( (NOT EIGEN_SPLIT_LARGE_TESTS) AND suffixes) + # Check whether we have EIGEN_TEST_PART_* statements, in which case we likely must enforce splitting. + # For instance, indexed_view activate a different c++ version for each part. + string(REGEX MATCHALL "EIGEN_TEST_PART_[0-9]+" occurrences "${test_source}") + string(REGEX REPLACE "EIGEN_TEST_PART_" "" explicit_suffixes "${occurrences}") + list(REMOVE_DUPLICATES explicit_suffixes) + endif() + if( (EIGEN_SPLIT_LARGE_TESTS AND suffixes) OR explicit_suffixes) + add_custom_target(${testname}) + foreach(suffix ${suffixes}) + ei_add_test_internal(${testname} ${testname}_${suffix} + "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}") + add_dependencies(${testname} ${testname}_${suffix}) + endforeach() + else() + ei_add_test_internal(${testname} ${testname} "${ARGV1} -DEIGEN_TEST_PART_ALL=1" "${ARGV2}") + endif() +endmacro() + +# adds a failtest, i.e. 
a test that succeed if the program fails to compile +# note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON +# so here we're just running CMake commands immediately, we're not adding any targets. +macro(ei_add_failtest testname) + + set(test_target_ok ${testname}_ok) + set(test_target_ko ${testname}_ko) + + # Add executables + add_executable(${test_target_ok} ${testname}.cpp) + add_executable(${test_target_ko} ${testname}.cpp) + + # Remove them from the normal build process + set_target_properties(${test_target_ok} ${test_target_ko} PROPERTIES + EXCLUDE_FROM_ALL TRUE + EXCLUDE_FROM_DEFAULT_BUILD TRUE) + + # Configure the failing test + target_compile_definitions(${test_target_ko} PRIVATE EIGEN_SHOULD_FAIL_TO_BUILD) + + # Add the tests to ctest. + add_test(NAME ${test_target_ok} + COMMAND ${CMAKE_COMMAND} --build . --target ${test_target_ok} --config $ + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test(NAME ${test_target_ko} + COMMAND ${CMAKE_COMMAND} --build . 
--target ${test_target_ko} --config $ + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + # Expect the second test to fail + set_tests_properties(${test_target_ko} PROPERTIES WILL_FAIL TRUE) +endmacro() + +# print a summary of the different options +macro(ei_testing_print_summary) + message(STATUS "************************************************************") + message(STATUS "*** Eigen's unit tests configuration summary ***") + message(STATUS "************************************************************") + message(STATUS "") + message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") + message(STATUS "Build site: ${SITE}") + message(STATUS "Build string: ${BUILDNAME}") + get_property(EIGEN_TESTING_SUMMARY GLOBAL PROPERTY EIGEN_TESTING_SUMMARY) + get_property(EIGEN_TESTED_BACKENDS GLOBAL PROPERTY EIGEN_TESTED_BACKENDS) + get_property(EIGEN_MISSING_BACKENDS GLOBAL PROPERTY EIGEN_MISSING_BACKENDS) + message(STATUS "Enabled backends: ${EIGEN_TESTED_BACKENDS}") + message(STATUS "Disabled backends: ${EIGEN_MISSING_BACKENDS}") + + if(EIGEN_DEFAULT_TO_ROW_MAJOR) + message(STATUS "Default order: Row-major") + else() + message(STATUS "Default order: Column-major") + endif() + + if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT) + message(STATUS "Explicit alignment (hence vectorization) disabled") + elseif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) + message(STATUS "Explicit vectorization disabled (alignment kept enabled)") + else() + + message(STATUS "Maximal matrix/vector size: ${EIGEN_TEST_MAX_SIZE}") + + if(EIGEN_TEST_SSE2) + message(STATUS "SSE2: ON") + else() + message(STATUS "SSE2: Using architecture defaults") + endif() + + if(EIGEN_TEST_SSE3) + message(STATUS "SSE3: ON") + else() + message(STATUS "SSE3: Using architecture defaults") + endif() + + if(EIGEN_TEST_SSSE3) + message(STATUS "SSSE3: ON") + else() + message(STATUS "SSSE3: Using architecture defaults") + endif() + + if(EIGEN_TEST_SSE4_1) + message(STATUS "SSE4.1: ON") + else() + message(STATUS "SSE4.1: Using architecture defaults") + 
endif() + + if(EIGEN_TEST_SSE4_2) + message(STATUS "SSE4.2: ON") + else() + message(STATUS "SSE4.2: Using architecture defaults") + endif() + + if(EIGEN_TEST_AVX) + message(STATUS "AVX: ON") + else() + message(STATUS "AVX: Using architecture defaults") + endif() + + if(EIGEN_TEST_AVX2) + message(STATUS "AVX2: ON") + else() + message(STATUS "AVX2: Using architecture defaults") + endif() + + if(EIGEN_TEST_FMA) + message(STATUS "FMA: ON") + else() + message(STATUS "FMA: Using architecture defaults") + endif() + + if(EIGEN_TEST_AVX512) + message(STATUS "AVX512: ON") + else() + message(STATUS "AVX512: Using architecture defaults") + endif() + + if(EIGEN_TEST_AVX512DQ) + message(STATUS "AVX512DQ: ON") + else() + message(STATUS "AVX512DQ: Using architecture defaults") + endif() + + if(EIGEN_TEST_ALTIVEC) + message(STATUS "Altivec: ON") + else() + message(STATUS "Altivec: Using architecture defaults") + endif() + + if(EIGEN_TEST_VSX) + message(STATUS "VSX: ON") + else() + message(STATUS "VSX: Using architecture defaults") + endif() + + if(EIGEN_TEST_MSA) + message(STATUS "MIPS MSA: ON") + else() + message(STATUS "MIPS MSA: Using architecture defaults") + endif() + + if(EIGEN_TEST_NEON) + message(STATUS "ARM NEON: ON") + else() + message(STATUS "ARM NEON: Using architecture defaults") + endif() + + if(EIGEN_TEST_NEON64) + message(STATUS "ARMv8 NEON: ON") + else() + message(STATUS "ARMv8 NEON: Using architecture defaults") + endif() + + if(EIGEN_TEST_ZVECTOR) + message(STATUS "S390X ZVECTOR: ON") + else() + message(STATUS "S390X ZVECTOR: Using architecture defaults") + endif() + + if(EIGEN_TEST_CXX11) + message(STATUS "C++11: ON") + else() + message(STATUS "C++11: OFF") + endif() + + if(EIGEN_TEST_SYCL) + if(EIGEN_SYCL_TRISYCL) + message(STATUS "SYCL: ON (using triSYCL)") + else() + message(STATUS "SYCL: ON (using computeCPP)") + endif() + else() + message(STATUS "SYCL: OFF") + endif() + if(EIGEN_TEST_CUDA) + if(EIGEN_TEST_CUDA_CLANG) + message(STATUS "CUDA: ON (using 
clang)") + else() + message(STATUS "CUDA: ON (using nvcc)") + endif() + else() + message(STATUS "CUDA: OFF") + endif() + if(EIGEN_TEST_HIP) + message(STATUS "HIP: ON (using hipcc)") + else() + message(STATUS "HIP: OFF") + endif() + + endif() # vectorization / alignment options + + message(STATUS "\n${EIGEN_TESTING_SUMMARY}") + + message(STATUS "************************************************************") +endmacro() + +macro(ei_init_testing) + define_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT BRIEF_DOCS " " FULL_DOCS " ") + define_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS BRIEF_DOCS " " FULL_DOCS " ") + define_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS BRIEF_DOCS " " FULL_DOCS " ") + define_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY BRIEF_DOCS " " FULL_DOCS " ") + define_property(GLOBAL PROPERTY EIGEN_TESTS_LIST BRIEF_DOCS " " FULL_DOCS " ") + define_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST BRIEF_DOCS " " FULL_DOCS " ") + + set_property(GLOBAL PROPERTY EIGEN_TESTED_BACKENDS "") + set_property(GLOBAL PROPERTY EIGEN_MISSING_BACKENDS "") + set_property(GLOBAL PROPERTY EIGEN_TESTING_SUMMARY "") + set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "") + set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "") + + define_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT BRIEF_DOCS " " FULL_DOCS " ") + define_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT BRIEF_DOCS " " FULL_DOCS " ") + + set_property(GLOBAL PROPERTY EIGEN_FAILTEST_FAILURE_COUNT "0") + set_property(GLOBAL PROPERTY EIGEN_FAILTEST_COUNT "0") + + # uncomment anytime you change the ei_get_compilerver_from_cxx_version_string macro + # ei_test_get_compilerver_from_cxx_version_string() +endmacro() + +macro(ei_set_sitename) + # if the sitename is not yet set, try to set it + if(NOT ${SITE} OR ${SITE} STREQUAL "") + set(eigen_computername $ENV{COMPUTERNAME}) + set(eigen_hostname $ENV{HOSTNAME}) + if(eigen_hostname) + set(SITE ${eigen_hostname}) + elseif(eigen_computername) + set(SITE 
${eigen_computername}) + endif() + endif() + # in case it is already set, enforce lower case + if(SITE) + string(TOLOWER ${SITE} SITE) + endif() +endmacro() + +macro(ei_get_compilerver VAR) + if(MSVC) + # on windows system, we use a modified CMake script + include(EigenDetermineVSServicePack) + EigenDetermineVSServicePack( my_service_pack ) + + if( my_service_pack ) + set(${VAR} ${my_service_pack}) + else() + set(${VAR} "na") + endif() + elseif(${CMAKE_CXX_COMPILER_ID} MATCHES "PGI") + set(${VAR} "${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}") + else() + # on all other system we rely on ${CMAKE_CXX_COMPILER} + # supporting a "--version" or "/version" flag + + if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} EQUAL "Intel") + set(EIGEN_CXX_FLAG_VERSION "/version") + else() + set(EIGEN_CXX_FLAG_VERSION "--version") + endif() + + execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION} + OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "^[ \n\r]+" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) + string(REGEX REPLACE "[\n\r].*" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) + + ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER) + set(${VAR} "${CNAME}-${CVER}") + + endif() +endmacro() + +# Extract compiler name and version from a raw version string +# WARNING: if you edit this macro, then please test it by uncommenting +# the testing macro call in ei_init_testing() of the EigenTesting.cmake file. 
+# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end +# of the file +macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) + # extract possible compiler names + string(REGEX MATCH "g\\+\\+" ei_has_gpp ${VERSTRING}) + string(REGEX MATCH "llvm|LLVM" ei_has_llvm ${VERSTRING}) + string(REGEX MATCH "gcc|GCC" ei_has_gcc ${VERSTRING}) + string(REGEX MATCH "icpc|ICC" ei_has_icpc ${VERSTRING}) + string(REGEX MATCH "clang|CLANG" ei_has_clang ${VERSTRING}) + string(REGEX MATCH "mingw32" ei_has_mingw ${VERSTRING}) + + # combine them + if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc)) + set(${CNAME} "llvm-g++") + elseif((ei_has_llvm) AND (ei_has_clang)) + set(${CNAME} "llvm-clang++") + elseif(ei_has_clang) + set(${CNAME} "clang++") + elseif ((ei_has_mingw) AND (ei_has_gpp OR ei_has_gcc)) + set(${CNAME} "mingw32-g++") + elseif(ei_has_icpc) + set(${CNAME} "icpc") + elseif(ei_has_gpp OR ei_has_gcc) + set(${CNAME} "g++") + else() + set(${CNAME} "_") + endif() + + # extract possible version numbers + # first try to extract 3 isolated numbers: + string(REGEX MATCH " [0-9]+\\.[0-9]+\\.[0-9]+" eicver ${VERSTRING}) + if(NOT eicver) + # try to extract 2 isolated ones: + string(REGEX MATCH " [0-9]+\\.[0-9]+" eicver ${VERSTRING}) + if(NOT eicver) + # try to extract 3: + string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+\\.[0-9]+" eicver ${VERSTRING}) + if(NOT eicver) + # try to extract 2: + string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+" eicver ${VERSTRING}) + if (NOT eicver AND ei_has_mingw) + # try to extract 1 number plus suffix: + string(REGEX MATCH "[^0-9][0-9]+-win32" eicver ${VERSTRING}) + endif() + endif() + endif() + endif() + + if (NOT eicver) + set(eicver " _") + endif() + + string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver}) + +endmacro() + +macro(ei_get_cxxflags VAR) + set(${VAR} "") + ei_is_64bit_env(IS_64BIT_ENV) + if(EIGEN_TEST_NEON) + set(${VAR} NEON) + elseif(EIGEN_TEST_NEON64) + set(${VAR} NEON) + elseif(EIGEN_TEST_ZVECTOR) + set(${VAR} 
ZVECTOR) + elseif(EIGEN_TEST_VSX) + set(${VAR} VSX) + elseif(EIGEN_TEST_ALTIVEC) + set(${VAR} ALVEC) + elseif(EIGEN_TEST_FMA) + set(${VAR} FMA) + elseif(EIGEN_TEST_AVX) + set(${VAR} AVX) + elseif(EIGEN_TEST_SSE4_2) + set(${VAR} SSE42) + elseif(EIGEN_TEST_SSE4_1) + set(${VAR} SSE41) + elseif(EIGEN_TEST_SSSE3) + set(${VAR} SSSE3) + elseif(EIGEN_TEST_SSE3) + set(${VAR} SSE3) + elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV) + set(${VAR} SSE2) + elseif(EIGEN_TEST_MSA) + set(${VAR} MSA) + endif() + + if(EIGEN_TEST_OPENMP) + if (${VAR} STREQUAL "") + set(${VAR} OMP) + else() + set(${VAR} ${${VAR}}-OMP) + endif() + endif() + + if(EIGEN_DEFAULT_TO_ROW_MAJOR) + if (${VAR} STREQUAL "") + set(${VAR} ROW) + else() + set(${VAR} ${${VAR}}-ROWMAJ) + endif() + endif() +endmacro() + +macro(ei_set_build_string) + ei_get_compilerver(LOCAL_COMPILER_VERSION) + ei_get_cxxflags(LOCAL_COMPILER_FLAGS) + + include(EigenDetermineOSVersion) + DetermineOSVersion(OS_VERSION) + + set(TMP_BUILD_STRING ${OS_VERSION}-${LOCAL_COMPILER_VERSION}) + + if (NOT ${LOCAL_COMPILER_FLAGS} STREQUAL "") + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS}) + endif() + + if(EIGEN_TEST_EXTERNAL_BLAS) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas) + endif() + + ei_is_64bit_env(IS_64BIT_ENV) + if(NOT IS_64BIT_ENV) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit) + else() + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-64bit) + endif() + + if(EIGEN_TEST_CXX11) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-cxx11) + endif() + + if(EIGEN_BUILD_STRING_SUFFIX) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${EIGEN_BUILD_STRING_SUFFIX}) + endif() + + string(TOLOWER ${TMP_BUILD_STRING} BUILDNAME) +endmacro() + +macro(ei_is_64bit_env VAR) + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(${VAR} 1) + elseif(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(${VAR} 0) + else() + message(WARNING "Unsupported pointer size. 
Please contact the authors.") + endif() +endmacro() + + +# helper macro for testing ei_get_compilerver_from_cxx_version_string +# STR: raw version string +# REFNAME: expected compiler name +# REFVER: expected compiler version +macro(ei_test1_get_compilerver_from_cxx_version_string STR REFNAME REFVER) + ei_get_compilerver_from_cxx_version_string(${STR} CNAME CVER) + if((NOT ${REFNAME} STREQUAL ${CNAME}) OR (NOT ${REFVER} STREQUAL ${CVER})) + message("STATUS ei_get_compilerver_from_cxx_version_string error:") + message("Expected \"${REFNAME}-${REFVER}\", got \"${CNAME}-${CVER}\"") + endif() +endmacro() + +# macro for testing ei_get_compilerver_from_cxx_version_string +# feel free to add more version strings +macro(ei_test_get_compilerver_from_cxx_version_string) + ei_test1_get_compilerver_from_cxx_version_string("g++ (SUSE Linux) 4.5.3 20110428 [gcc-4_5-branch revision 173117]" "g++" "4.5.3") + ei_test1_get_compilerver_from_cxx_version_string("c++ (GCC) 4.5.1 20100924 (Red Hat 4.5.1-4)" "g++" "4.5.1") + ei_test1_get_compilerver_from_cxx_version_string("icpc (ICC) 11.0 20081105" "icpc" "11.0") + ei_test1_get_compilerver_from_cxx_version_string("g++-3.4 (GCC) 3.4.6" "g++" "3.4.6") + ei_test1_get_compilerver_from_cxx_version_string("SUSE Linux clang version 3.0 (branches/release_30 145598) (based on LLVM 3.0)" "llvm-clang++" "3.0") + ei_test1_get_compilerver_from_cxx_version_string("icpc (ICC) 12.0.5 20110719" "icpc" "12.0.5") + ei_test1_get_compilerver_from_cxx_version_string("Apple clang version 2.1 (tags/Apple/clang-163.7.1) (based on LLVM 3.0svn)" "llvm-clang++" "2.1") + ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1") + ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6") + ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4") + ei_test1_get_compilerver_from_cxx_version_string("x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110" "mingw32-g++" "10-win32") +endmacro() + +# Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets +# named buildtestspartN with N = { 0, ..., num_splits-1}. +# +# The intention behind the existance of this macro is the size of Eigen's +# testsuite. Together with the relativly big compile-times building all tests +# can take a substantial amount of time depending on the available hardware. +# +# The last buildtestspartN target will build possible remaining tests. +# +# An example: +# +# EIGEN_TESTS_LIST= [ test1, test2, test3, test4, test5, test6, test7 ] +# +# A call to ei_split_testsuite(3) creates the following targets with dependencies +# +# Target Dependencies +# ------ ------------ +# buildtestspart0 test1, test2 +# buildtestspart1 test3, test4 +# buildtestspart2 test5, test6, test7 +# +macro(ei_split_testsuite num_splits) + get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST) + + # Translate EIGEN_TESTS_LIST into a CMake list + string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") + set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") + separate_arguments(EIGEN_TESTS_LIST) + + set(eigen_test_count "0") + foreach(t IN ITEMS ${EIGEN_TESTS_LIST}) + math(EXPR eigen_test_count "${eigen_test_count}+1") + endforeach() + + # Get number of tests per target + math(EXPR num_tests_per_target "${eigen_test_count}/${num_splits} - ${eigen_test_count}/${num_splits} % 1") + + set(test_idx "0") + math(EXPR target_bound "${num_splits}-1") + foreach(part RANGE "0" "${target_bound}") + # Create target + set(current_target "buildtestspart${part}") + add_custom_target("${current_target}") + math(EXPR upper_bound 
"${test_idx} + ${num_tests_per_target} - 1") + foreach(test_idx RANGE "${test_idx}" "${upper_bound}") + list(GET EIGEN_TESTS_LIST "${test_idx}" curr_test) + add_dependencies("${current_target}" "${curr_test}") + endforeach() + math(EXPR test_idx "${test_idx} + ${num_tests_per_target}") + endforeach() + + # Handle the possibly remaining tests + math(EXPR test_idx "${num_splits} * ${num_tests_per_target}") + math(EXPR target_bound "${eigen_test_count} - 1") + foreach(test_idx RANGE "${test_idx}" "${target_bound}") + list(GET EIGEN_TESTS_LIST "${test_idx}" curr_test) + add_dependencies("${current_target}" "${curr_test}") + endforeach() +endmacro(ei_split_testsuite num_splits) + +# Defines the custom command buildsmoketests to build a number of tests +# specified in smoke_test_list. +# +# Test in smoke_test_list can be either test targets (e.g. packetmath) or +# subtests targets (e.g. packetmath_2). If any of the test are not available +# in the current configuration they are just skipped. +# +# All tests added via this macro are labeled with the smoketest label. This +# allows running smoketests only using ctest. +# +# Smoke tests are intended to be run before the whole test suite is invoked, +# e.g., to smoke test patches. 
+macro(ei_add_smoke_tests smoke_test_list) + # Set the build target to build smoketests + set(buildtarget "buildsmoketests") + add_custom_target("${buildtarget}") + + # Get list of all tests and translate it into a CMake list + get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST) + string(REGEX REPLACE "\n" " " EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") + set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") + separate_arguments(EIGEN_TESTS_LIST) + + # Check if the test in smoke_test_list is a currently valid test target + foreach(test IN ITEMS ${smoke_test_list}) + # Add tests in smoke_test_list to our smoke test target but only if the test + # is currently available, i.e., is in EIGEN_SUBTESTS_LIST + if ("${test}" IN_LIST EIGEN_TESTS_LIST) + add_dependencies("${buildtarget}" "${test}") + # In the case of a test we match all subtests + set(ctest_regex "${ctest_regex}^${test}_[0-9]+$$|") + endif() + endforeach() + + # Get list of all subtests and translate it into a CMake list + get_property(EIGEN_SUBTESTS_LIST GLOBAL PROPERTY EIGEN_SUBTESTS_LIST) + string(REGEX REPLACE "\n" " " EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + separate_arguments(EIGEN_SUBTESTS_LIST) + + # Check if the test in smoke_test_list is a currently valid subtest target + foreach(test IN ITEMS ${smoke_test_list}) + # Add tests in smoke_test_list to our smoke test target but only if the test + # is currently available, i.e., is in EIGEN_SUBTESTS_LIST + if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST) + add_dependencies("${buildtarget}" "${test}") + # Add label smoketest to be able to run smoketests using ctest + get_property(test_labels TEST ${test} PROPERTY LABELS) + set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest") + endif() + endforeach() +endmacro(ei_add_smoke_tests) diff --git a/externals/eigen/cmake/EigenUninstall.cmake b/externals/eigen/cmake/EigenUninstall.cmake new file mode 100644 index 00000000..5e63c98d --- /dev/null 
+++ b/externals/eigen/cmake/EigenUninstall.cmake @@ -0,0 +1,40 @@ +################ CMake Uninstall Template ####################### +# CMake Template file for uninstallation of files +# mentioned in 'install_manifest.txt' +# +# Used by uinstall target +################################################################# + +set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt") + +if(EXISTS ${MANIFEST}) + message(STATUS "============== Uninstalling Eigen ===================") + + file(STRINGS ${MANIFEST} files) + foreach(file ${files}) + if(EXISTS ${file}) + message(STATUS "Removing file: '${file}'") + + execute_process( + COMMAND ${CMAKE_COMMAND} -E remove ${file} + OUTPUT_VARIABLE rm_out + RESULT_VARIABLE rm_retval + ) + + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Failed to remove file: '${file}'.") + endif() + else() + message(STATUS "File '${file}' does not exist.") + endif() + endforeach() + + message(STATUS "========== Finished Uninstalling Eigen ==============") +else() + message(STATUS "Cannot find install manifest: '${MANIFEST}'") + message(STATUS "Probably make install has not been performed") + message(STATUS " or install_manifest.txt has been deleted.") +endif() + + + diff --git a/externals/eigen/cmake/FindAdolc.cmake b/externals/eigen/cmake/FindAdolc.cmake new file mode 100644 index 00000000..13c59fcf --- /dev/null +++ b/externals/eigen/cmake/FindAdolc.cmake @@ -0,0 +1,20 @@ + +if (ADOLC_INCLUDES AND ADOLC_LIBRARIES) + set(ADOLC_FIND_QUIETLY TRUE) +endif () + +find_path(ADOLC_INCLUDES + NAMES adolc/adtl.h + PATHS $ENV{ADOLCDIR} $ENV{ADOLCDIR}/include ${INCLUDE_INSTALL_DIR} +) + +find_library(ADOLC_LIBRARIES + adolc + PATHS $ENV{ADOLCDIR} ${LIB_INSTALL_DIR} + PATH_SUFFIXES lib lib64) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Adolc DEFAULT_MSG + ADOLC_INCLUDES ADOLC_LIBRARIES) + +mark_as_advanced(ADOLC_INCLUDES ADOLC_LIBRARIES) diff --git a/externals/eigen/cmake/FindBLAS.cmake 
b/externals/eigen/cmake/FindBLAS.cmake new file mode 100644 index 00000000..1bb8f196 --- /dev/null +++ b/externals/eigen/cmake/FindBLAS.cmake @@ -0,0 +1,1407 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find BLAS library +# This module finds an installed fortran library that implements the BLAS +# linear-algebra interface (see http://www.netlib.org/blas/). +# The list of libraries searched for is taken +# from the autoconf macro file, acx_blas.m4 (distributed at +# http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). +# +# This module sets the following variables: +# BLAS_FOUND - set to true if a library implementing the BLAS interface +# is found +# BLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l +# and -L). +# BLAS_COMPILER_FLAGS - uncached list of required compiler flags (including -I for mkl headers). 
+# BLAS_LIBRARIES - uncached list of libraries (using full path name) to +# link against to use BLAS +# BLAS95_LIBRARIES - uncached list of libraries (using full path name) +# to link against to use BLAS95 interface +# BLAS95_FOUND - set to true if a library implementing the BLAS f95 interface +# is found +# BLA_STATIC if set on this determines what kind of linkage we do (static) +# BLA_VENDOR if set checks only the specified vendor, if not set checks +# all the possibilities +# BLAS_VENDOR_FOUND stores the BLAS vendor found +# BLA_F95 if set on tries to find the f95 interfaces for BLAS/LAPACK +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DBLAS_DIR=path/to/blas): +# BLAS_DIR - Where to find the base directory of blas +# BLAS_INCDIR - Where to find the header files +# BLAS_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: BLAS_DIR, BLAS_INCDIR, BLAS_LIBDIR +# For MKL case and if no paths are given as hints, we will try to use the MKLROOT +# environment variable +# BLAS_VERBOSE Print some additional information during BLAS libraries detection +########## +### List of vendors (BLA_VENDOR) valid in this module +########## List of vendors (BLA_VENDOR) valid in this module +## Open (for OpenBlas), Eigen (for EigenBlas), Goto, ATLAS PhiPACK, +##  CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT +## Intel10_32 (intel mkl v10 32 bit), Intel10_64lp (intel mkl v10 64 bit,lp thread model, lp64 model), +## Intel10_64lp_seq (intel mkl v10 64 bit,sequential code, lp64 model), +## Intel( older versions of mkl 32 and 64 bit), +##  ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic +# C/CXX should be enabled to use Intel mkl +### +# We handle different modes to find the dependency +# +# - Detection if already installed on the system +# - BLAS libraries can be detected from different ways +# Here is 
the order of precedence: +# 1) we look in cmake variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined +# 2) we look in environment variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined +# 3) we look in common environnment variables depending on the system (INCLUDE, C_INCLUDE_PATH, CPATH - LIB, DYLD_LIBRARY_PATH, LD_LIBRARY_PATH) +# 4) we look in common system paths depending on the system, see for example paths contained in the following cmake variables: +# - CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES +# - CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_C_IMPLICIT_LINK_DIRECTORIES +# + +#============================================================================= +# Copyright 2007-2009 Kitware, Inc. +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + +## Some macros to print status when search for headers and libs +# This macro informs why the _lib_to_find file has not been found +macro(Print_Find_Library_Blas_Status _libname _lib_to_find) + + # save _libname upper/lower case + string(TOUPPER ${_libname} LIBNAME) + string(TOLOWER ${_libname} libname) + + # print status + #message(" ") + if(${LIBNAME}_LIBDIR) + message("${Yellow}${LIBNAME}_LIBDIR is defined but ${_lib_to_find}" + "has not been found in ${ARGN}${ColourReset}") + else() + if(${LIBNAME}_DIR) + message("${Yellow}${LIBNAME}_DIR is defined but ${_lib_to_find}" + "has not been found in ${ARGN}${ColourReset}") + else() + message("${Yellow}${_lib_to_find} not found." 
+ "Nor ${LIBNAME}_DIR neither ${LIBNAME}_LIBDIR" + "are defined so that we look for ${_lib_to_find} in" + "system paths (Linux: LD_LIBRARY_PATH, Windows: LIB," + "Mac: DYLD_LIBRARY_PATH," + "CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES," + "CMAKE_C_IMPLICIT_LINK_DIRECTORIES)${ColourReset}") + if(_lib_env) + message("${Yellow}${_lib_to_find} has not been found in" + "${_lib_env}${ColourReset}") + endif() + endif() + endif() + message("${BoldYellow}Please indicate where to find ${_lib_to_find}. You have three options:\n" + "- Option 1: Provide the Installation directory of BLAS library with cmake option: -D${LIBNAME}_DIR=your/path/to/${libname}/\n" + "- Option 2: Provide the directory where to find the library with cmake option: -D${LIBNAME}_LIBDIR=your/path/to/${libname}/lib/\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "- Option 4: If your library provides a PkgConfig file, make sure pkg-config finds your library${ColourReset}") + +endmacro() + +# This macro informs why the _lib_to_find file has not been found +macro(Print_Find_Library_Blas_CheckFunc_Status _name) + + # save _libname upper/lower case + string(TOUPPER ${_name} FUNCNAME) + string(TOLOWER ${_name} funcname) + + # print status + #message(" ") + message("${Red}Libs have been found but check of symbol ${_name} failed " + "with following libraries ${ARGN}${ColourReset}") + message("${BoldRed}Please open your error file CMakeFiles/CMakeError.log" + "to figure out why it fails${ColourReset}") + #message(" ") + +endmacro() + +if (NOT BLAS_FOUND) + set(BLAS_DIR "" CACHE PATH "Installation directory of BLAS library") + if (NOT BLAS_FIND_QUIETLY) + message(STATUS "A cache variable, namely BLAS_DIR, has been set to specify the install directory of BLAS") + endif() +endif() + +option(BLAS_VERBOSE "Print some additional information during BLAS libraries detection" OFF) +mark_as_advanced(BLAS_VERBOSE) + +include(CheckFunctionExists) 
+include(CheckFortranFunctionExists) +include(CMakeFindDependencyMacro) + +set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + +# Check the language being used +get_property( _LANGUAGES_ GLOBAL PROPERTY ENABLED_LANGUAGES ) +if( _LANGUAGES_ MATCHES Fortran AND CMAKE_Fortran_COMPILER) + set( _CHECK_FORTRAN TRUE ) +elseif( (_LANGUAGES_ MATCHES C) OR (_LANGUAGES_ MATCHES CXX) ) + set( _CHECK_FORTRAN FALSE ) +else() + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.") + else() + message(STATUS "Looking for BLAS... - NOT found (Unsupported languages)") + return() + endif() +endif() + +macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) + # This macro checks for the existence of the combination of fortran libraries + # given by _list. If the combination is found, this macro checks (using the + # Check_Fortran_Function_Exists macro) whether can link against that library + # combination using the name of a routine given by _name using the linker + # flags given by _flags. If the combination of libraries is found and passes + # the link test, LIBRARIES is set to the list of complete library paths that + # have been found. Otherwise, LIBRARIES is set to FALSE. + + # N.B. _prefix is the prefix applied to the names of all cached variables that + # are generated internally and marked advanced by this macro. 
+ + set(_libdir ${ARGN}) + + set(_libraries_work TRUE) + set(${LIBRARIES}) + set(_combined_name) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") + set(ENV_BLAS_LIBDIR "$ENV{BLAS_LIBDIR}") + if (NOT _libdir) + if (BLAS_LIBDIR) + list(APPEND _libdir "${BLAS_LIBDIR}") + elseif (BLAS_DIR) + list(APPEND _libdir "${BLAS_DIR}") + list(APPEND _libdir "${BLAS_DIR}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${BLAS_DIR}/lib64") + list(APPEND _libdir "${BLAS_DIR}/lib/intel64") + else() + list(APPEND _libdir "${BLAS_DIR}/lib32") + list(APPEND _libdir "${BLAS_DIR}/lib/ia32") + endif() + elseif(ENV_BLAS_LIBDIR) + list(APPEND _libdir "${ENV_BLAS_LIBDIR}") + elseif(ENV_BLAS_DIR) + list(APPEND _libdir "${ENV_BLAS_DIR}") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib64") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib/intel64") + else() + list(APPEND _libdir "${ENV_BLAS_DIR}/lib32") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib/ia32") + endif() + else() + if (ENV_MKLROOT) + list(APPEND _libdir "${ENV_MKLROOT}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${ENV_MKLROOT}/lib64") + list(APPEND _libdir "${ENV_MKLROOT}/lib/intel64") + else() + list(APPEND _libdir "${ENV_MKLROOT}/lib32") + list(APPEND _libdir "${ENV_MKLROOT}/lib/ia32") + endif() + endif() + if (WIN32) + string(REPLACE ":" ";" _libdir2 "$ENV{LIB}") + elseif (APPLE) + string(REPLACE ":" ";" _libdir2 "$ENV{DYLD_LIBRARY_PATH}") + else () + string(REPLACE ":" ";" _libdir2 "$ENV{LD_LIBRARY_PATH}") + endif () + list(APPEND _libdir "${_libdir2}") + list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif () + + if (BLAS_VERBOSE) + message("${Cyan}Try to find BLAS libraries: ${_list}") + endif () + + foreach(_library ${_list}) + set(_combined_name ${_combined_name}_${_library}) + + 
if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + else () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + HINTS ${_libdir} + NO_DEFAULT_PATH + ) + mark_as_advanced(${_prefix}_${_library}_LIBRARY) + # Print status if not found + # ------------------------- + if (NOT ${_prefix}_${_library}_LIBRARY AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) + Print_Find_Library_Blas_Status(blas ${_library} ${_libdir}) + endif () + set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif() + endforeach() + + if(_libraries_work) + # Test this combination of libraries. + if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC) + list(INSERT ${LIBRARIES} 0 "-Wl,--start-group") + list(APPEND ${LIBRARIES} "-Wl,--end-group") + endif() + set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}") + set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}") + if (BLAS_VERBOSE) + message("${Cyan}BLAS libs found for BLA_VENDOR ${BLA_VENDOR}." 
+ "Try to compile symbol ${_name} with following libraries:" + "${CMAKE_REQUIRED_LIBRARIES}") + endif () + if(NOT BLAS_FOUND) + unset(${_prefix}${_combined_name}_WORKS CACHE) + endif() + if (_CHECK_FORTRAN) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + string(REPLACE "mkl_intel_lp64" "mkl_gf_lp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + string(REPLACE "mkl_intel_ilp64" "mkl_gf_ilp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + endif() + check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS) + else() + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + endif() + mark_as_advanced(${_prefix}${_combined_name}_WORKS) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + # Print status if not found + # ------------------------- + if (NOT _libraries_work AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) + Print_Find_Library_Blas_CheckFunc_Status(${_name} ${CMAKE_REQUIRED_LIBRARIES}) + endif () + set(CMAKE_REQUIRED_LIBRARIES) + endif() + + if(_libraries_work) + set(${LIBRARIES} ${${LIBRARIES}} ${_thread}) + else() + set(${LIBRARIES} FALSE) + endif() + +endmacro() + + +set(BLAS_LINKER_FLAGS) +set(BLAS_LIBRARIES) +set(BLAS95_LIBRARIES) +if ($ENV{BLA_VENDOR} MATCHES ".+") + set(BLA_VENDOR $ENV{BLA_VENDOR}) +else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() +endif () + +#BLAS in intel mkl 10 library? 
(em64t 64bit) +if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*") + # Looking for include + # ------------------- + + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") + set(ENV_BLAS_INCDIR "$ENV{BLAS_INCDIR}") + if(ENV_BLAS_INCDIR) + list(APPEND _inc_env "${ENV_BLAS_INCDIR}") + elseif(ENV_BLAS_DIR) + list(APPEND _inc_env "${ENV_BLAS_DIR}") + list(APPEND _inc_env "${ENV_BLAS_DIR}/include") + else() + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + # system variables + if(WIN32) + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") + + # Try to find the fftw header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(BLAS_INCDIR) + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${BLAS_INCDIR}) + else() + if(BLAS_DIR) + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${BLAS_DIR} + PATH_SUFFIXES "include") + else() + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES 
mkl.h + HINTS ${PATH_TO_LOOK_FOR}) + endif() + endif() + mark_as_advanced(BLAS_mkl.h_DIRS) + + # If found, add path to cmake variable + # ------------------------------------ + if (BLAS_mkl.h_DIRS) + set(BLAS_INCLUDE_DIRS "${BLAS_mkl.h_DIRS}") + else () + set(BLAS_INCLUDE_DIRS "BLAS_INCLUDE_DIRS-NOTFOUND") + if(NOT BLAS_FIND_QUIETLY) + message(STATUS "Looking for BLAS -- mkl.h not found") + endif() + endif() + + if (WIN32) + string(REPLACE ":" ";" _libdir "$ENV{LIB}") + elseif (APPLE) + string(REPLACE ":" ";" _libdir "$ENV{DYLD_LIBRARY_PATH}") + else () + string(REPLACE ":" ";" _libdir "$ENV{LD_LIBRARY_PATH}") + endif () + list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + # libiomp5 + # -------- + set(OMP_iomp5_LIBRARY "OMP_iomp5_LIBRARY-NOTFOUND") + find_library(OMP_iomp5_LIBRARY + NAMES iomp5 + HINTS ${_libdir} + ) + mark_as_advanced(OMP_iomp5_LIBRARY) + set(OMP_LIB "") + # libgomp + # ------- + set(OMP_gomp_LIBRARY "OMP_gomp_LIBRARY-NOTFOUND") + find_library(OMP_gomp_LIBRARY + NAMES gomp + HINTS ${_libdir} + ) + mark_as_advanced(OMP_gomp_LIBRARY) + # choose one or another depending on the compilo + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + if (OMP_gomp_LIBRARY) + set(OMP_LIB "${OMP_gomp_LIBRARY}") + endif() + else() + if (OMP_iomp5_LIBRARY) + set(OMP_LIB "${OMP_iomp5_LIBRARY}") + endif() + endif() + + if (UNIX AND NOT WIN32) + # m + find_library(M_LIBRARY + NAMES m + HINTS ${_libdir}) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + set(LM "-lm") + else() + set(LM "") + endif() + # Fortran + set(LGFORTRAN "") + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_libdir} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + set(LGFORTRAN "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_libdir} + 
) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + set(LGFORTRAN "${FORTRAN_ifcore_LIBRARY}") # dereference the located ifcore path (the "$" was missing) + endif() + endif() + set(BLAS_COMPILER_FLAGS "") + if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_COMPILER_FLAGS "-openmp") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_COMPILER_FLAGS "-fopenmp") + endif() + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + if (BLA_VENDOR STREQUAL "Intel10_32") + list(APPEND BLAS_COMPILER_FLAGS "-m32") + else() + list(APPEND BLAS_COMPILER_FLAGS "-m64") + endif() + if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") + list(APPEND OMP_LIB "-ldl") + endif() + if (ENV_MKLROOT) + list(APPEND BLAS_COMPILER_FLAGS "-I${ENV_MKLROOT}/include") + endif() + endif() + + set(additional_flags "") + if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(additional_flags "-Wl,--no-as-needed") + endif() + endif () + + if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) + if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) + find_dependency(Threads) + else() + find_dependency(Threads REQUIRED) + endif() + + set(BLAS_SEARCH_LIBS "") + + if(BLA_F95) + + set(BLAS_mkl_SEARCH_SYMBOL SGEMM) + set(_LIBRARIES BLAS95_LIBRARIES) + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "^Intel10_64lp" OR BLA_VENDOR STREQUAL "All") # regex: STREQUAL does not expand "*"; covers Intel10_64lp and Intel10_64lp_seq + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95_lp64${BLAS_mkl_DLL_SUFFIX} mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs (vendors ending in "_seq" select the sequential MKL layer) + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR 
STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + if (NOT BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") # regex: STREQUAL does not expand "*" + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else () + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_intel mkl_intel_thread mkl_core guide") + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_intel_lp64 mkl_intel_thread mkl_core guide") + # mkl >= 10.3 + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_lp64 mkl_intel_lp64 mkl_intel_thread mkl_core") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_lp64 mkl_intel_lp64 mkl_gnu_thread mkl_core") + endif() + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_sequential mkl_core") + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") + set(OMP_LIB "") + endif() + endif () + endif () + + else () + + set(BLAS_mkl_SEARCH_SYMBOL sgemm) + set(_LIBRARIES BLAS_LIBRARIES) + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + 
"mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "^Intel10_64lp" OR BLA_VENDOR STREQUAL "All") # regex: covers Intel10_64lp and Intel10_64lp_seq + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs (vendors ending in "_seq" select the sequential MKL layer) + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (NOT BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else () + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel mkl_intel_thread mkl_core guide") + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_intel_thread mkl_core guide") + # mkl >= 10.3 + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_intel_thread mkl_core") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_gnu_thread mkl_core") + endif() + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_sequential mkl_core") + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") + set(OMP_LIB "") + endif() + endif () + # older versions of intel mkl libs + if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl") + 
list(APPEND BLAS_SEARCH_LIBS + "mkl_ia32") + list(APPEND BLAS_SEARCH_LIBS + "mkl_em64t") + endif () + endif () + + endif () + + foreach (IT ${BLAS_SEARCH_LIBS}) + string(REPLACE " " ";" SEARCH_LIBS ${IT}) + if (${_LIBRARIES}) + else () + check_fortran_libraries( + ${_LIBRARIES} + BLAS + ${BLAS_mkl_SEARCH_SYMBOL} + "${additional_flags}" + "${SEARCH_LIBS}" + "${OMP_LIB};${CMAKE_THREAD_LIBS_INIT};${LM}" + ) + if(${_LIBRARIES}) # test the result variable itself; _LIBRARIES only holds its name and is always true + set(BLAS_LINKER_FLAGS "${additional_flags}") + endif() + endif() + endforeach () + if(NOT BLAS_FIND_QUIETLY) + if(${_LIBRARIES}) + message(STATUS "Looking for MKL BLAS: found") + else() + message(STATUS "Looking for MKL BLAS: not found") + endif() + endif() + if (${_LIBRARIES} AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Intel MKL") + endif() + endif () + endif() +endif () + + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "goto2" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Goto BLAS: found") + else() + message(STATUS "Looking for Goto BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Goto") + endif() + +endif () + + +# OpenBlas +if (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # openblas (http://www.openblas.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Open BLAS: found") + else() + message(STATUS "Looking for Open BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Openblas") + endif() + +endif () + + +# EigenBlas +if (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # eigenblas 
(http://eigen.tuxfamily.org/index.php?title=Main_Page) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "eigen_blas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + message(STATUS "Looking for Eigen BLAS: found") + else() + message(STATUS "Looking for Eigen BLAS: not found") + endif() + endif() + endif() + + if(NOT BLAS_LIBRARIES) + # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "eigen_blas_static" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Eigen BLAS: found") + else() + message(STATUS "Looking for Eigen BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Eigen") + endif() + +endif () + + +if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "f77blas;atlas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Atlas BLAS: found") + else() + message(STATUS "Looking for Atlas BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Atlas") + endif() + +endif () + + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "sgemm;dgemm;blas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for PhiPACK BLAS: found") + else() + message(STATUS "Looking for PhiPACK BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "PhiPACK") + endif() + +endif () + + +# BLAS in Alpha CXML library? 
+if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "cxml" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for CXML BLAS: found") + else() + message(STATUS "Looking for CXML BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "CXML") + endif() + +endif () + + +# BLAS in Alpha DXML library? (now called CXML, see above) +if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "dxml" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for DXML BLAS: found") + else() + message(STATUS "Looking for DXML BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "DXML") + endif() + +endif () + + +# BLAS in Sun Performance library? +if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "-xlic_lib=sunperf" + "sunperf;sunmath" + "" + ) + if(BLAS_LIBRARIES) + set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + endif() + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SunPerf BLAS: found") + else() + message(STATUS "Looking for SunPerf BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SunPerf") + endif() + +endif () + + +# BLAS in SCSL library? 
(SGI/Cray Scientific Library) +if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "scsl" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SCSL BLAS: found") + else() + message(STATUS "Looking for SCSL BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SCSL") # was "SunPerf", copied from the SunPerf section + endif() + +endif () + + +# BLAS in SGIMATH library? +if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "complib.sgimath" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SGIMATH BLAS: found") + else() + message(STATUS "Looking for SGIMATH BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SGIMATH") + endif() + +endif () + + +# BLAS in IBM ESSL library (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "essl;xlfmath;xlf90_r;blas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for IBM ESSL BLAS: found") + else() + message(STATUS "Looking for IBM ESSL BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "IBM ESSL") + endif() + +endif () + +# BLAS in IBM ESSL_MT library (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSLMT" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "esslsmp;xlsmp;xlfmath;xlf90_r;blas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for IBM ESSL MT BLAS: found") + else() + message(STATUS "Looking 
for IBM ESSL MT BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "IBM ESSL MT") + endif() + +endif () + + +#BLAS in acml library? +if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") + + if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS))) + + # try to find acml in "standard" paths + if( WIN32 ) + file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" ) + else() + file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" ) + endif() + if( WIN32 ) + file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" ) + else() + file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" ) + endif() + list(GET _ACML_ROOT 0 _ACML_ROOT) + list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT) + + if( _ACML_ROOT ) + + get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH ) + if( SIZEOF_INTEGER EQUAL 8 ) + set( _ACML_PATH_SUFFIX "_int64" ) + else() + set( _ACML_PATH_SUFFIX "" ) + endif() + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + set( _ACML_COMPILER32 "ifort32" ) + set( _ACML_COMPILER64 "ifort64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) + set( _ACML_COMPILER32 "sun32" ) + set( _ACML_COMPILER64 "sun64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) + set( _ACML_COMPILER32 "pgi32" ) + if( WIN32 ) + set( _ACML_COMPILER64 "win64" ) + else() + set( _ACML_COMPILER64 "pgi64" ) + endif() + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" ) + # 32 bit builds not supported on Open64 but for code simplicity + # We'll just use the same directory twice + set( _ACML_COMPILER32 "open64_64" ) + set( _ACML_COMPILER64 "open64_64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" ) + set( _ACML_COMPILER32 "nag32" ) + set( _ACML_COMPILER64 "nag64" ) + else() + set( _ACML_COMPILER32 "gfortran32" ) + set( _ACML_COMPILER64 "gfortran64" ) + endif() + + if( 
BLA_VENDOR STREQUAL "ACML_MP" ) + set(_ACML_MP_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" ) + else() + set(_ACML_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" ) + endif() + + endif() + + elseif(BLAS_${BLA_VENDOR}_LIB_DIRS) + + set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS}) + + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + elseif( BLA_VENDOR STREQUAL "ACML_GPU" ) + foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + else() + foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} ) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + endif() + + # Either acml or acml_mp should be in LD_LIBRARY_PATH but not both + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml;acml_mv" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() + endif() + endif() + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml_mp;acml_mv" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() + endif() + endif() + + if(NOT BLAS_LIBRARIES) + 
check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml;acml_mv;CALBLAS" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "ACML") + endif() + +endif () # ACML + + +# Apple BLAS library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "Accelerate" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Apple BLAS: found") + else() + message(STATUS "Looking for Apple BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Apple Accelerate") + endif() + +endif () + + +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + + if ( NOT BLAS_LIBRARIES ) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "vecLib" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for NAS BLAS: found") + else() + message(STATUS "Looking for NAS BLAS: not found") + endif() + endif() + endif () + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "NAS") + endif() + +endif () + + +# Generic BLAS library? 
+if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + + set(BLAS_SEARCH_LIBS "blas;blas_LINUX;blas_MAC;blas_WINDOWS;refblas") + foreach (SEARCH_LIB ${BLAS_SEARCH_LIBS}) + if (BLAS_LIBRARIES) + else () + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "${SEARCH_LIB}" + "${LGFORTRAN}" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Generic BLAS: found") + else() + message(STATUS "Looking for Generic BLAS: not found") + endif() + endif() + endif() + endforeach () + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Netlib or other Generic libblas") + endif() + +endif () + + +if(BLA_F95) + + if(BLAS95_LIBRARIES) + set(BLAS95_FOUND TRUE) + else() + set(BLAS95_FOUND FALSE) + endif() + + if(NOT BLAS_FIND_QUIETLY) + if(BLAS95_FOUND) + message(STATUS "A library with BLAS95 API found.") + message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") + else() + message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas 95 libraries could not be found or check of symbols failed." + "\nPlease indicate where to find blas libraries. You have three options:\n" + "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" + "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." + "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." 
+ "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," + "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with BLAS95 API not found. Please specify library location.") + else() + message(STATUS + "A library with BLAS95 API not found. Please specify library location.") + endif() + endif() + endif() + + set(BLAS_FOUND ${BLAS95_FOUND}) # report the BLAS95 outcome; unconditional TRUE led to list(GET) on an empty BLAS_LIBRARIES below + set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}") + +else() + + if(BLAS_LIBRARIES) + set(BLAS_FOUND TRUE) + else() + set(BLAS_FOUND FALSE) + endif() + + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_FOUND) + message(STATUS "A library with BLAS API found.") + message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") + else() + message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas libraries could not be found or check of symbols failed." + "\nPlease indicate where to find blas libraries. You have three options:\n" + "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" + "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." + "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." 
+ "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," + "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with BLAS API not found. Please specify library location.") + else() + message(STATUS + "A library with BLAS API not found. Please specify library location.") + endif() + endif() + endif() + +endif() + +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) + +if (BLAS_FOUND) + list(GET BLAS_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") + string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") + set(BLAS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of BLAS library" FORCE) + else() + set(BLAS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of BLAS library" FORCE) + endif() +endif() +mark_as_advanced(BLAS_DIR) +mark_as_advanced(BLAS_DIR_FOUND) diff --git a/externals/eigen/cmake/FindBLASEXT.cmake b/externals/eigen/cmake/FindBLASEXT.cmake new file mode 100644 index 00000000..69a94189 --- /dev/null +++ b/externals/eigen/cmake/FindBLASEXT.cmake @@ -0,0 +1,384 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. 
+# +### +# +# - Find BLAS EXTENDED for MORSE projects: find include dirs and libraries +# +# This module allows to find BLAS libraries by calling the official FindBLAS module +# and handles the creation of different library lists whether the user wishes to link +# with a sequential BLAS or a multihreaded (BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES). +# BLAS is detected with a FindBLAS call then if the BLAS vendor is Intel10_64lp, ACML +# or IBMESSLMT then the module attempts to find the corresponding multithreaded libraries. +# +# The following variables have been added to manage links with sequential or multithreaded +# versions: +# BLAS_INCLUDE_DIRS - BLAS include directories +# BLAS_LIBRARY_DIRS - Link directories for BLAS libraries +# BLAS_SEQ_LIBRARIES - BLAS component libraries to be linked (sequential) +# BLAS_PAR_LIBRARIES - BLAS component libraries to be linked (multithreaded) + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2016 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) 
+ +# macro to factorize this call +include(CMakeFindDependencyMacro) +macro(find_package_blas) + if(BLASEXT_FIND_REQUIRED) + if(BLASEXT_FIND_QUIETLY) + find_dependency(BLAS REQUIRED QUIET) + else() + find_dependency(BLAS REQUIRED) + endif() + else() + if(BLASEXT_FIND_QUIETLY) + find_dependency(BLAS QUIET) + else() + find_dependency(BLAS) + endif() + endif() +endmacro() + +# add a cache variable to let the user specify the BLAS vendor +set(BLA_VENDOR "" CACHE STRING "list of possible BLAS vendor: + Open, Eigen, Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, + Intel10_32 (intel mkl v10 32 bit), + Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), + Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model), + Intel( older versions of mkl 32 and 64 bit), + ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + +if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "In FindBLASEXT") + message(STATUS "If you want to force the use of one specific library, " + "\n please specify the BLAS vendor by setting -DBLA_VENDOR=blas_vendor_name" + "\n at cmake configure.") + message(STATUS "List of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, " + "\n DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, Intel10_32 (intel mkl v10 32 bit)," + "\n Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model)," + "\n Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "\n Intel( older versions of mkl 32 and 64 bit)," + "\n ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") +endif() + +if (NOT BLAS_FOUND) + # First try to detect two cases: + # 1: only SEQ libs are handled + # 2: both SEQ and PAR libs are handled + find_package_blas() +endif () + +# detect the cases where SEQ and PAR libs are handled +if(BLA_VENDOR STREQUAL "All" AND + (BLAS_mkl_core_LIBRARY OR BLAS_mkl_core_dll_LIBRARY) + ) + set(BLA_VENDOR "Intel") + if(BLAS_mkl_intel_LIBRARY) + set(BLA_VENDOR "Intel10_32") + endif() + if(BLAS_mkl_intel_lp64_LIBRARY) + 
set(BLA_VENDOR "Intel10_64lp") + endif() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the MKL." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +elseif(BLA_VENDOR STREQUAL "All" AND BLAS_acml_LIBRARY) + set(BLA_VENDOR "ACML") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the ACML." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +elseif(BLA_VENDOR STREQUAL "All" AND BLAS_essl_LIBRARY) + set(BLA_VENDOR "IBMESSL") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the ESSL." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +endif() + +# Intel case +if(BLA_VENDOR MATCHES "Intel*") + + ### + # look for include path if the BLAS vendor is Intel + ### + + # gather system include paths + unset(_inc_env) + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + set(ENV_MKLROOT "$ENV{MKLROOT}") + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + 
list(REMOVE_DUPLICATES _inc_env) + + # find mkl.h inside known include paths + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + if(BLAS_INCDIR) + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${BLAS_INCDIR}) + else() + if(BLAS_DIR) + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${BLAS_DIR} + PATH_SUFFIXES include) + else() + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${_inc_env}) + endif() + endif() + mark_as_advanced(BLAS_mkl.h_INCLUDE_DIRS) + ## Print status if not found + ## ------------------------- + #if (NOT BLAS_mkl.h_INCLUDE_DIRS AND MORSE_VERBOSE) + # Print_Find_Header_Status(blas mkl.h) + #endif () + set(BLAS_INCLUDE_DIRS "") + if(BLAS_mkl.h_INCLUDE_DIRS) + list(APPEND BLAS_INCLUDE_DIRS "${BLAS_mkl.h_INCLUDE_DIRS}" ) + endif() + + ### + # look for libs + ### + # if Intel 10 64 bit -> look for sequential and multithreaded versions + if(BLA_VENDOR MATCHES "Intel10_64lp*") + + ## look for the sequential version + set(BLA_VENDOR "Intel10_64lp_seq") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "Look for the sequential version Intel10_64lp_seq") + endif() + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "Intel10_64lp") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "Look for the multithreaded version Intel10_64lp") + endif() + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + + else() + + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + 
endif() + + # ACML case +elseif(BLA_VENDOR MATCHES "ACML*") + + ## look for the sequential version + set(BLA_VENDOR "ACML") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "ACML_MP") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + + # IBMESSL case +elseif(BLA_VENDOR MATCHES "IBMESSL*") + + ## look for the sequential version + set(BLA_VENDOR "IBMESSL") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "IBMESSLMT") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + +else() + + if(BLAS_FOUND) + # define the SEQ libs as the BLAS_LIBRARIES + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + +endif() + + +if(BLAS_SEQ_LIBRARIES) + set(BLAS_LIBRARIES "${BLAS_SEQ_LIBRARIES}") +endif() + +# extract libs paths +# remark: because it is not given by find_package(BLAS) +set(BLAS_LIBRARY_DIRS "") +string(REPLACE " " ";" BLAS_LIBRARIES "${BLAS_LIBRARIES}") +foreach(blas_lib ${BLAS_LIBRARIES}) + if (EXISTS "${blas_lib}") + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + else() + string(REPLACE "-L" "" blas_lib "${blas_lib}") + if (EXISTS "${blas_lib}") + list(APPEND BLAS_LIBRARY_DIRS "${blas_lib}" ) + else() + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + if (EXISTS "${a_blas_lib_dir}") + list(APPEND 
BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + endif() + endif() + endif() +endforeach() +if (BLAS_LIBRARY_DIRS) + list(REMOVE_DUPLICATES BLAS_LIBRARY_DIRS) +endif () + +# check that BLAS has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +if(BLA_VENDOR MATCHES "Intel*") + if(BLA_VENDOR MATCHES "Intel10_64lp*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is Intel MKL:" + "\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS + BLAS_INCLUDE_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() + else() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS + BLAS_INCLUDE_DIRS) + endif() +elseif(BLA_VENDOR MATCHES "ACML*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is ACML:" + "\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() +elseif(BLA_VENDOR MATCHES "IBMESSL*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is ESSL:" + 
"\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() +else() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLASEXT DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) +endif() + +# Callers expect BLAS_FOUND to be set as well: mirror the value of BLASEXT_FOUND (not its name). +set(BLAS_FOUND "${BLASEXT_FOUND}") diff --git a/externals/eigen/cmake/FindCHOLMOD.cmake b/externals/eigen/cmake/FindCHOLMOD.cmake new file mode 100644 index 00000000..e470cb2e --- /dev/null +++ b/externals/eigen/cmake/FindCHOLMOD.cmake @@ -0,0 +1,89 @@ +# CHOLMOD lib usually requires linking to a blas and lapack library. +# It is up to the user of this module to find a BLAS and link to it. 
+ +if (CHOLMOD_INCLUDES AND CHOLMOD_LIBRARIES) + set(CHOLMOD_FIND_QUIETLY TRUE) +endif () + +find_path(CHOLMOD_INCLUDES + NAMES + cholmod.h + PATHS + $ENV{CHOLMODDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +find_library(CHOLMOD_LIBRARIES cholmod PATHS $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + +if(CHOLMOD_LIBRARIES) + + get_filename_component(CHOLMOD_LIBDIR ${CHOLMOD_LIBRARIES} PATH) + + find_library(AMD_LIBRARY amd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + if (AMD_LIBRARY) + set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${AMD_LIBRARY}) + else () + set(CHOLMOD_LIBRARIES FALSE) + endif () + +endif() + +if(CHOLMOD_LIBRARIES) + + find_library(COLAMD_LIBRARY colamd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + if (COLAMD_LIBRARY) + set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${COLAMD_LIBRARY}) + else () + set(CHOLMOD_LIBRARIES FALSE) + endif () + +endif() + +if(CHOLMOD_LIBRARIES) + + find_library(CAMD_LIBRARY camd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + if (CAMD_LIBRARY) + set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CAMD_LIBRARY}) + else () + set(CHOLMOD_LIBRARIES FALSE) + endif () + +endif() + +if(CHOLMOD_LIBRARIES) + + find_library(CCOLAMD_LIBRARY ccolamd PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + if (CCOLAMD_LIBRARY) + set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CCOLAMD_LIBRARY}) + else () + set(CHOLMOD_LIBRARIES FALSE) + endif () + +endif() + +if(CHOLMOD_LIBRARIES) + + find_library(CHOLMOD_METIS_LIBRARY metis PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + if (CHOLMOD_METIS_LIBRARY) + set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${CHOLMOD_METIS_LIBRARY}) + endif () + +endif() + +if(CHOLMOD_LIBRARIES) + + find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${CHOLMOD_LIBDIR} $ENV{CHOLMODDIR} ${LIB_INSTALL_DIR}) + if (SUITESPARSE_LIBRARY) + set(CHOLMOD_LIBRARIES ${CHOLMOD_LIBRARIES} ${SUITESPARSE_LIBRARY}) + endif () + +endif() + 
+include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CHOLMOD DEFAULT_MSG + CHOLMOD_INCLUDES CHOLMOD_LIBRARIES) + +mark_as_advanced(CHOLMOD_INCLUDES CHOLMOD_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY SUITESPARSE_LIBRARY CAMD_LIBRARY CCOLAMD_LIBRARY CHOLMOD_METIS_LIBRARY) diff --git a/externals/eigen/cmake/FindComputeCpp.cmake b/externals/eigen/cmake/FindComputeCpp.cmake new file mode 100644 index 00000000..1c271f0f --- /dev/null +++ b/externals/eigen/cmake/FindComputeCpp.cmake @@ -0,0 +1,455 @@ +#.rst: +# FindComputeCpp +#--------------- +# +# Copyright 2016-2018 Codeplay Software Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use these files except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +######################### +# FindComputeCpp.cmake +######################### +# +# Tools for finding and building with ComputeCpp. +# +# User must define ComputeCpp_DIR pointing to the ComputeCpp +# installation. 
+# +# Latest version of this file can be found at: +# https://github.com/codeplaysoftware/computecpp-sdk + +cmake_minimum_required(VERSION 3.4.3) +include(FindPackageHandleStandardArgs) +include(ComputeCppIRMap) + +set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++") +separate_arguments(COMPUTECPP_USER_FLAGS) +mark_as_advanced(COMPUTECPP_USER_FLAGS) + +set(COMPUTECPP_BITCODE "spir64" CACHE STRING + "Bitcode type to use as SYCL target in compute++") +mark_as_advanced(COMPUTECPP_BITCODE) + +include(CMakeFindDependencyMacro) +find_dependency(OpenCL REQUIRED) + +# Find ComputeCpp package + +if(DEFINED ComputeCpp_DIR) + set(computecpp_find_hint ${ComputeCpp_DIR}) +elseif(DEFINED ENV{COMPUTECPP_DIR}) + set(computecpp_find_hint $ENV{COMPUTECPP_DIR}) +endif() + +# Used for running executables on the host +set(computecpp_host_find_hint ${computecpp_find_hint}) + +if(CMAKE_CROSSCOMPILING) + # ComputeCpp_HOST_DIR is used to find executables that are run on the host + if(DEFINED ComputeCpp_HOST_DIR) + set(computecpp_host_find_hint ${ComputeCpp_HOST_DIR}) + elseif(DEFINED ENV{COMPUTECPP_HOST_DIR}) + set(computecpp_host_find_hint $ENV{COMPUTECPP_HOST_DIR}) + endif() +endif() + +find_program(ComputeCpp_DEVICE_COMPILER_EXECUTABLE compute++ + HINTS ${computecpp_host_find_hint} + PATH_SUFFIXES bin + NO_SYSTEM_ENVIRONMENT_PATH) + +find_program(ComputeCpp_INFO_EXECUTABLE computecpp_info + HINTS ${computecpp_host_find_hint} + PATH_SUFFIXES bin + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(COMPUTECPP_RUNTIME_LIBRARY + NAMES ComputeCpp ComputeCpp_vs2015 + HINTS ${computecpp_find_hint} + PATH_SUFFIXES lib + DOC "ComputeCpp Runtime Library") + +find_library(COMPUTECPP_RUNTIME_LIBRARY_DEBUG + NAMES ComputeCpp_d ComputeCpp ComputeCpp_vs2015_d + HINTS ${computecpp_find_hint} + PATH_SUFFIXES lib + DOC "ComputeCpp Debug Runtime Library") + +find_path(ComputeCpp_INCLUDE_DIRS + NAMES "CL/sycl.hpp" + HINTS ${computecpp_find_hint}/include + DOC "The ComputeCpp include directory") 
+get_filename_component(ComputeCpp_INCLUDE_DIRS ${ComputeCpp_INCLUDE_DIRS} ABSOLUTE) + +get_filename_component(computecpp_canonical_root_dir "${ComputeCpp_INCLUDE_DIRS}/.." ABSOLUTE) +set(ComputeCpp_ROOT_DIR "${computecpp_canonical_root_dir}" CACHE PATH + "The root of the ComputeCpp install") + +if(NOT ComputeCpp_INFO_EXECUTABLE) + message(WARNING "Can't find computecpp_info - check ComputeCpp_DIR") +else() + execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-version" + OUTPUT_VARIABLE ComputeCpp_VERSION + RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0") + message(WARNING "Package version - Error obtaining version!") + endif() + + execute_process(COMMAND ${ComputeCpp_INFO_EXECUTABLE} "--dump-is-supported" + OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED + RESULT_VARIABLE ComputeCpp_INFO_EXECUTABLE_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT ComputeCpp_INFO_EXECUTABLE_RESULT EQUAL "0") + message(WARNING "platform - Error checking platform support!") + else() + mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) + if (COMPUTECPP_PLATFORM_IS_SUPPORTED) + message(STATUS "platform - your system can support ComputeCpp") + else() + message(STATUS "platform - your system is not officially supported") + endif() + endif() +endif() + +find_package_handle_standard_args(ComputeCpp + REQUIRED_VARS ComputeCpp_ROOT_DIR + ComputeCpp_DEVICE_COMPILER_EXECUTABLE + ComputeCpp_INFO_EXECUTABLE + COMPUTECPP_RUNTIME_LIBRARY + COMPUTECPP_RUNTIME_LIBRARY_DEBUG + ComputeCpp_INCLUDE_DIRS + VERSION_VAR ComputeCpp_VERSION) +mark_as_advanced(ComputeCpp_ROOT_DIR + ComputeCpp_DEVICE_COMPILER_EXECUTABLE + ComputeCpp_INFO_EXECUTABLE + COMPUTECPP_RUNTIME_LIBRARY + COMPUTECPP_RUNTIME_LIBRARY_DEBUG + ComputeCpp_INCLUDE_DIRS + ComputeCpp_VERSION) + +if(NOT ComputeCpp_FOUND) + return() +endif() + +list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -O2 -mllvm -inline-threshold=1000 -intelspirmetadata) 
+mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS) + +if(CMAKE_CROSSCOMPILING) + if(NOT COMPUTECPP_DONT_USE_TOOLCHAIN) + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --gcc-toolchain=${COMPUTECPP_TOOLCHAIN_DIR}) + endif() + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS --sysroot=${COMPUTECPP_SYSROOT_DIR}) + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -target ${COMPUTECPP_TARGET_TRIPLE}) +endif() + +list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -sycl-target ${COMPUTECPP_BITCODE}) +message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") + +include(ComputeCppCompilerChecks) + +if(NOT TARGET OpenCL::OpenCL) + add_library(OpenCL::OpenCL UNKNOWN IMPORTED) + set_target_properties(OpenCL::OpenCL PROPERTIES + IMPORTED_LOCATION "${OpenCL_LIBRARIES}" + INTERFACE_INCLUDE_DIRECTORIES "${OpenCL_INCLUDE_DIRS}" + ) +endif() + +if(NOT TARGET ComputeCpp::ComputeCpp) + add_library(ComputeCpp::ComputeCpp UNKNOWN IMPORTED) + set_target_properties(ComputeCpp::ComputeCpp PROPERTIES + IMPORTED_LOCATION_DEBUG "${COMPUTECPP_RUNTIME_LIBRARY_DEBUG}" + IMPORTED_LOCATION_RELWITHDEBINFO "${COMPUTECPP_RUNTIME_LIBRARY}" + IMPORTED_LOCATION "${COMPUTECPP_RUNTIME_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ComputeCpp_INCLUDE_DIRS}" + INTERFACE_LINK_LIBRARIES "OpenCL::OpenCL" + ) +endif() + +# This property allows targets to specify that their sources should be +# compiled with the integration header included after the user's +# sources, not before (e.g. when an enum is used in a kernel name, this +# is not technically valid SYCL code but can work with ComputeCpp) +define_property( + TARGET PROPERTY COMPUTECPP_INCLUDE_AFTER + BRIEF_DOCS "Include integration header after user source" + FULL_DOCS "Changes compiler arguments such that the source file is + actually the integration header, and the .cpp file is included on + the command line so that it is seen by the compiler first. Enables + non-standards-conformant SYCL code to compile with ComputeCpp." 
+) +define_property( + TARGET PROPERTY INTERFACE_COMPUTECPP_FLAGS + BRIEF_DOCS "Interface compile flags to provide compute++" + FULL_DOCS "Set additional compile flags to pass to compute++ when compiling + any target which links to this one." +) +define_property( + SOURCE PROPERTY COMPUTECPP_SOURCE_FLAGS + BRIEF_DOCS "Source file compile flags for compute++" + FULL_DOCS "Set additional compile flags for compiling the SYCL integration + header for the given source file." +) + +#################### +# __build_ir +#################### +# +# Adds a custom target for running compute++ and adding a dependency for the +# resulting integration header and kernel binary. +# +# TARGET : Name of the target. +# SOURCE : Source file to be compiled. +# COUNTER : Counter included in name of custom target. Different counter +# values prevent duplicated names of custom target when source files with +# the same name, but located in different directories, are used for the +# same target. +# +function(__build_ir) + set(options) + set(one_value_args + TARGET + SOURCE + COUNTER + ) + set(multi_value_args) + cmake_parse_arguments(SDK_BUILD_IR + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + get_filename_component(sourceFileName ${SDK_BUILD_IR_SOURCE} NAME) + + # Set the path to the integration header. + # The .sycl filename must depend on the target so that different targets + # using the same source file will be generated with a different rule. 
+ set(baseSyclName ${CMAKE_CURRENT_BINARY_DIR}/${SDK_BUILD_IR_TARGET}_${sourceFileName}) + set(outputSyclFile ${baseSyclName}.sycl) + set(outputDeviceFile ${baseSyclName}.${IR_MAP_${COMPUTECPP_BITCODE}}) + set(depFileName ${baseSyclName}.sycl.d) + + set(include_directories "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},INCLUDE_DIRECTORIES>") + set(compile_definitions "$<TARGET_PROPERTY:${SDK_BUILD_IR_TARGET},COMPILE_DEFINITIONS>") + set(generated_include_directories + $<$<BOOL:${include_directories}>:-I\"$<JOIN:${include_directories},\"\t-I\">\">) + set(generated_compile_definitions + $<$<BOOL:${compile_definitions}>:-D$<JOIN:${compile_definitions},\t-D>>) + + # Obtain language standard of the file + set(device_compiler_cxx_standard) + get_target_property(targetCxxStandard ${SDK_BUILD_IR_TARGET} CXX_STANDARD) + if (targetCxxStandard MATCHES 17) + set(device_compiler_cxx_standard "-std=c++1z") + elseif (targetCxxStandard MATCHES 14) + set(device_compiler_cxx_standard "-std=c++14") + elseif (targetCxxStandard MATCHES 11) + set(device_compiler_cxx_standard "-std=c++11") + elseif (targetCxxStandard MATCHES 98) + message(FATAL_ERROR "SYCL applications cannot be compiled using C++98") + else () + set(device_compiler_cxx_standard "") + endif() + + get_property(source_compile_flags + SOURCE ${SDK_BUILD_IR_SOURCE} + PROPERTY COMPUTECPP_SOURCE_FLAGS + ) + separate_arguments(source_compile_flags) + if(source_compile_flags) + list(APPEND computecpp_source_flags ${source_compile_flags}) + endif() + + list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS + ${device_compiler_cxx_standard} + ${COMPUTECPP_USER_FLAGS} + ${computecpp_source_flags} + ) + + set(ir_dependencies ${SDK_BUILD_IR_SOURCE}) + get_target_property(target_libraries ${SDK_BUILD_IR_TARGET} LINK_LIBRARIES) + if(target_libraries) + foreach(library ${target_libraries}) + if(TARGET ${library}) + list(APPEND ir_dependencies ${library}) + endif() + endforeach() + endif() + + # Depfile support was only added in CMake 3.7 + # CMake throws an error if it is 
unsupported by the generator (i. e. 
not ninja) + if((NOT CMAKE_VERSION VERSION_LESS 3.7.0) AND + CMAKE_GENERATOR MATCHES "Ninja") + file(RELATIVE_PATH relOutputFile ${CMAKE_BINARY_DIR} ${outputDeviceFile}) + set(generate_depfile -MMD -MF ${depFileName} -MT ${relOutputFile}) + set(enable_depfile DEPFILE ${depFileName}) + endif() + + # Add custom command for running compute++ + add_custom_command( + OUTPUT ${outputDeviceFile} ${outputSyclFile} + COMMAND ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + ${generated_include_directories} + ${generated_compile_definitions} + -sycl-ih ${outputSyclFile} + -o ${outputDeviceFile} + -c ${SDK_BUILD_IR_SOURCE} + ${generate_depfile} + DEPENDS ${ir_dependencies} + IMPLICIT_DEPENDS CXX ${SDK_BUILD_IR_SOURCE} + ${enable_depfile} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") + + # Name: (user-defined name)_(source file)_(counter)_ih + set(headerTargetName + ${SDK_BUILD_IR_TARGET}_${sourceFileName}_${SDK_BUILD_IR_COUNTER}_ih) + + if(NOT MSVC) + # Add a custom target for the generated integration header + add_custom_target(${headerTargetName} DEPENDS ${outputDeviceFile} ${outputSyclFile}) + add_dependencies(${SDK_BUILD_IR_TARGET} ${headerTargetName}) + endif() + + # This property can be set on a per-target basis to indicate that the + # integration header should appear after the main source listing + get_target_property(includeAfter ${SDK_BUILD_IR_TARGET} COMPUTECPP_INCLUDE_AFTER) + + if(includeAfter) + # Change the source file to the integration header - e.g. 
+ # g++ -c source_file_name.cpp.sycl + get_target_property(current_sources ${SDK_BUILD_IR_TARGET} SOURCES) + # Remove absolute path to source file + list(REMOVE_ITEM current_sources ${SDK_BUILD_IR_SOURCE}) + # Remove relative path to source file + string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" + rel_source_file ${SDK_BUILD_IR_SOURCE} + ) + list(REMOVE_ITEM current_sources ${rel_source_file}) + # Add SYCL header to source list + list(APPEND current_sources ${outputSyclFile}) + set_property(TARGET ${SDK_BUILD_IR_TARGET} + PROPERTY SOURCES ${current_sources}) + # CMake/gcc don't know what language a .sycl file is, so tell them + set_property(SOURCE ${outputSyclFile} PROPERTY LANGUAGE CXX) + set(includedFile ${SDK_BUILD_IR_SOURCE}) + set(cppFile ${outputSyclFile}) + else() + set_property(SOURCE ${outputSyclFile} PROPERTY HEADER_FILE_ONLY ON) + set(includedFile ${outputSyclFile}) + set(cppFile ${SDK_BUILD_IR_SOURCE}) + endif() + + # Force inclusion of the integration header for the host compiler + if(MSVC) + # Group SYCL files inside Visual Studio + source_group("SYCL" FILES ${outputSyclFile}) + + if(includeAfter) + # Allow the source file to be edited using Visual Studio. + # It will be added as a header file so it won't be compiled. + set_property(SOURCE ${SDK_BUILD_IR_SOURCE} PROPERTY HEADER_FILE_ONLY true) + endif() + + # Add both source and the sycl files to the VS solution. 
+ target_sources(${SDK_BUILD_IR_TARGET} PUBLIC ${SDK_BUILD_IR_SOURCE} ${outputSyclFile}) + + set(forceIncludeFlags "/FI${includedFile} /TP") + else() + set(forceIncludeFlags "-include ${includedFile} -x c++") + endif() + + set_property( + SOURCE ${cppFile} + APPEND_STRING PROPERTY COMPILE_FLAGS " ${forceIncludeFlags}" + ) + +endfunction(__build_ir) + +####################### +# add_sycl_to_target +####################### +# +# Adds a SYCL compilation custom command associated with an existing +# target and sets a dependency on that new command. +# +# TARGET : Name of the target to add SYCL to. 
+# SOURCES : Source files to be compiled for SYCL. +# +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(SDK_ADD_SYCL + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + + set_target_properties(${SDK_ADD_SYCL_TARGET} PROPERTIES LINKER_LANGUAGE CXX) + + # If the CXX compiler is set to compute++ enable the driver. + get_filename_component(cmakeCxxCompilerFileName "${CMAKE_CXX_COMPILER}" NAME) + if("${cmakeCxxCompilerFileName}" STREQUAL "compute++") + if(MSVC) + message(FATAL_ERROR "The compiler driver is not supported by this system, + revert the CXX compiler to your default host compiler.") + endif() + + get_target_property(includeAfter ${SDK_ADD_SYCL_TARGET} COMPUTECPP_INCLUDE_AFTER) + if(includeAfter) + list(APPEND COMPUTECPP_USER_FLAGS -fsycl-ih-last) + endif() + list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl-driver) + # Prepend COMPUTECPP_DEVICE_COMPILER_FLAGS and append COMPUTECPP_USER_FLAGS + foreach(prop COMPILE_OPTIONS INTERFACE_COMPILE_OPTIONS) + get_target_property(target_compile_options ${SDK_ADD_SYCL_TARGET} ${prop}) + if(NOT target_compile_options) + set(target_compile_options "") + endif() + set_property( + TARGET ${SDK_ADD_SYCL_TARGET} + PROPERTY ${prop} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + ${target_compile_options} + ${COMPUTECPP_USER_FLAGS} + ) + endforeach() + else() + set(fileCounter 0) + list(INSERT COMPUTECPP_DEVICE_COMPILER_FLAGS 0 -sycl) + # Add custom target to run compute++ and generate the integration header + foreach(sourceFile ${SDK_ADD_SYCL_SOURCES}) + if(NOT IS_ABSOLUTE ${sourceFile}) + set(sourceFile "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}") + endif() + __build_ir( + TARGET ${SDK_ADD_SYCL_TARGET} + SOURCE ${sourceFile} + COUNTER ${fileCounter} + ) + MATH(EXPR fileCounter "${fileCounter} + 1") + endforeach() + endif() + + set_property(TARGET ${SDK_ADD_SYCL_TARGET} + APPEND PROPERTY LINK_LIBRARIES 
ComputeCpp::ComputeCpp) + set_property(TARGET ${SDK_ADD_SYCL_TARGET} + APPEND PROPERTY INTERFACE_LINK_LIBRARIES ComputeCpp::ComputeCpp) +endfunction(add_sycl_to_target) diff --git a/externals/eigen/cmake/FindEigen2.cmake b/externals/eigen/cmake/FindEigen2.cmake new file mode 100644 index 00000000..eb2709dc --- /dev/null +++ b/externals/eigen/cmake/FindEigen2.cmake @@ -0,0 +1,80 @@ +# - Try to find Eigen2 lib +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(Eigen2 2.0.3) +# to require version 2.0.3 or newer of Eigen2. +# +# Once done this will define +# +# EIGEN2_FOUND - system has eigen lib with correct version +# EIGEN2_INCLUDE_DIR - the eigen include directory +# EIGEN2_VERSION - eigen version + +# Copyright (c) 2006, 2007 Montel Laurent, +# Copyright (c) 2008, 2009 Gael Guennebaud, +# Redistribution and use is allowed according to the terms of the BSD license. + +if(NOT Eigen2_FIND_VERSION) + if(NOT Eigen2_FIND_VERSION_MAJOR) + set(Eigen2_FIND_VERSION_MAJOR 2) + endif() + if(NOT Eigen2_FIND_VERSION_MINOR) + set(Eigen2_FIND_VERSION_MINOR 0) + endif() + if(NOT Eigen2_FIND_VERSION_PATCH) + set(Eigen2_FIND_VERSION_PATCH 0) + endif() + + set(Eigen2_FIND_VERSION "${Eigen2_FIND_VERSION_MAJOR}.${Eigen2_FIND_VERSION_MINOR}.${Eigen2_FIND_VERSION_PATCH}") +endif() + +macro(_eigen2_check_version) + file(READ "${EIGEN2_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen2_version_header) + + string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen2_world_version_match "${_eigen2_version_header}") + set(EIGEN2_WORLD_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen2_major_version_match "${_eigen2_version_header}") + set(EIGEN2_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen2_minor_version_match "${_eigen2_version_header}") + set(EIGEN2_MINOR_VERSION "${CMAKE_MATCH_1}") + + set(EIGEN2_VERSION 
${EIGEN2_WORLD_VERSION}.${EIGEN2_MAJOR_VERSION}.${EIGEN2_MINOR_VERSION}) + if((NOT "${EIGEN2_WORLD_VERSION}" EQUAL 2) OR ("${EIGEN2_MAJOR_VERSION}" GREATER 10) OR ("${EIGEN2_VERSION}" VERSION_LESS "${Eigen2_FIND_VERSION}")) + set(EIGEN2_VERSION_OK FALSE) + else() + set(EIGEN2_VERSION_OK TRUE) + endif() + + if(NOT EIGEN2_VERSION_OK) + + message(STATUS "Eigen2 version ${EIGEN2_VERSION} found in ${EIGEN2_INCLUDE_DIR}, " + "but at least version ${Eigen2_FIND_VERSION} is required") + endif() +endmacro() + +if (EIGEN2_INCLUDE_DIR) + + # in cache already + _eigen2_check_version() + set(EIGEN2_FOUND ${EIGEN2_VERSION_OK}) + +else () + +find_path(EIGEN2_INCLUDE_DIR NAMES Eigen/Core + PATHS + ${INCLUDE_INSTALL_DIR} + ${KDE4_INCLUDE_DIR} + PATH_SUFFIXES eigen2 + ) + +if(EIGEN2_INCLUDE_DIR) + _eigen2_check_version() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Eigen2 DEFAULT_MSG EIGEN2_INCLUDE_DIR EIGEN2_VERSION_OK) + +mark_as_advanced(EIGEN2_INCLUDE_DIR) + +endif() + + diff --git a/externals/eigen/cmake/FindEigen3.cmake b/externals/eigen/cmake/FindEigen3.cmake new file mode 100644 index 00000000..0b36805e --- /dev/null +++ b/externals/eigen/cmake/FindEigen3.cmake @@ -0,0 +1,107 @@ +# - Try to find Eigen3 lib +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(Eigen3 3.1.2) +# to require version 3.1.2 or newer of Eigen3. 
+# +# Once done this will define +# +# EIGEN3_FOUND - system has eigen lib with correct version +# EIGEN3_INCLUDE_DIR - the eigen include directory +# EIGEN3_VERSION - eigen version +# +# and the following imported target: +# +# Eigen3::Eigen - The header-only Eigen library +# +# This module reads hints about search locations from +# the following environment variables: +# +# EIGEN3_ROOT +# EIGEN3_ROOT_DIR + +# Copyright (c) 2006, 2007 Montel Laurent, +# Copyright (c) 2008, 2009 Gael Guennebaud, +# Copyright (c) 2009 Benoit Jacob +# Redistribution and use is allowed according to the terms of the 2-clause BSD license. + +if(NOT Eigen3_FIND_VERSION) + if(NOT Eigen3_FIND_VERSION_MAJOR) + set(Eigen3_FIND_VERSION_MAJOR 2) + endif() + if(NOT Eigen3_FIND_VERSION_MINOR) + set(Eigen3_FIND_VERSION_MINOR 91) + endif() + if(NOT Eigen3_FIND_VERSION_PATCH) + set(Eigen3_FIND_VERSION_PATCH 0) + endif() + + set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}") +endif() + +macro(_eigen3_check_version) + file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header) + + string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}") + set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}") + set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}") + set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}") + + set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION}) + if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION}) + set(EIGEN3_VERSION_OK FALSE) + else() + set(EIGEN3_VERSION_OK TRUE) + endif() + + if(NOT EIGEN3_VERSION_OK) + + message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in 
${EIGEN3_INCLUDE_DIR}, " + "but at least version ${Eigen3_FIND_VERSION} is required") + endif() +endmacro() + +if (EIGEN3_INCLUDE_DIR) + + # in cache already + _eigen3_check_version() + set(EIGEN3_FOUND ${EIGEN3_VERSION_OK}) + set(Eigen3_FOUND ${EIGEN3_VERSION_OK}) + +else () + + # search first if an Eigen3Config.cmake is available in the system, + # if successful this would set EIGEN3_INCLUDE_DIR and the rest of + # the script will work as usual + find_package(Eigen3 ${Eigen3_FIND_VERSION} NO_MODULE QUIET) + + if(NOT EIGEN3_INCLUDE_DIR) + find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library + HINTS + ENV EIGEN3_ROOT + ENV EIGEN3_ROOT_DIR + PATHS + ${CMAKE_INSTALL_PREFIX}/include + ${KDE4_INCLUDE_DIR} + PATH_SUFFIXES eigen3 eigen + ) + endif() + + if(EIGEN3_INCLUDE_DIR) + _eigen3_check_version() + endif() + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK) + + mark_as_advanced(EIGEN3_INCLUDE_DIR) + +endif() + +if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen) + add_library(Eigen3::Eigen INTERFACE IMPORTED) + set_target_properties(Eigen3::Eigen PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}") +endif() diff --git a/externals/eigen/cmake/FindFFTW.cmake b/externals/eigen/cmake/FindFFTW.cmake new file mode 100644 index 00000000..ed55c5fa --- /dev/null +++ b/externals/eigen/cmake/FindFFTW.cmake @@ -0,0 +1,120 @@ +# - Find the FFTW library +# +# Usage: +# find_package(FFTW [REQUIRED] [QUIET] ) +# +# It sets the following variables: +# FFTW_FOUND ... true if fftw is found on the system +# FFTW_LIBRARIES ... full path to fftw library +# FFTW_INCLUDES ... fftw include directory +# +# The following variables will be checked by the function +# FFTW_USE_STATIC_LIBS ... if true, only static libraries are found +# FFTW_ROOT ... if set, the libraries are exclusively searched +# under this path +# FFTW_LIBRARY ... fftw library to use +# FFTW_INCLUDE_DIR ... 
fftw include directory +# + +#If environment variable FFTWDIR is specified, it has same effect as FFTW_ROOT (tested with DEFINED: a bare ENV{...} inside if() is always false) +if( NOT FFTW_ROOT AND DEFINED ENV{FFTWDIR} ) + set( FFTW_ROOT $ENV{FFTWDIR} ) +endif() + +# Check if we can use PkgConfig +include(CMakeFindDependencyMacro) +find_dependency(PkgConfig) + +#Determine from PKG +if( PKG_CONFIG_FOUND AND NOT FFTW_ROOT ) + pkg_check_modules( PKG_FFTW QUIET "fftw3" ) +endif() + +#Check whether to search static or dynamic libs +set( CMAKE_FIND_LIBRARY_SUFFIXES_SAV ${CMAKE_FIND_LIBRARY_SUFFIXES} ) + +if( ${FFTW_USE_STATIC_LIBS} ) + set( CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX} ) +else() + set( CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_SHARED_LIBRARY_SUFFIX} ) +endif() + +if( FFTW_ROOT ) + + #find libs + find_library( + FFTW_LIB + NAMES "fftw3" + PATHS ${FFTW_ROOT} + PATH_SUFFIXES "lib" "lib64" + NO_DEFAULT_PATH + ) + + find_library( + FFTWF_LIB + NAMES "fftw3f" + PATHS ${FFTW_ROOT} + PATH_SUFFIXES "lib" "lib64" + NO_DEFAULT_PATH + ) + + find_library( + FFTWL_LIB + NAMES "fftw3l" + PATHS ${FFTW_ROOT} + PATH_SUFFIXES "lib" "lib64" + NO_DEFAULT_PATH + ) + + #find includes + find_path( + FFTW_INCLUDES + NAMES "fftw3.h" + PATHS ${FFTW_ROOT} + PATH_SUFFIXES "include" + NO_DEFAULT_PATH + ) + +else() + + find_library( + FFTW_LIB + NAMES "fftw3" + PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + ) + + find_library( + FFTWF_LIB + NAMES "fftw3f" + PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + ) + + + find_library( + FFTWL_LIB + NAMES "fftw3l" + PATHS ${PKG_FFTW_LIBRARY_DIRS} ${LIB_INSTALL_DIR} + ) + + find_path( + FFTW_INCLUDES + NAMES "fftw3.h" + PATHS ${PKG_FFTW_INCLUDE_DIRS} ${INCLUDE_INSTALL_DIR} + ) + +endif() + +set(FFTW_LIBRARIES ${FFTW_LIB} ${FFTWF_LIB}) + +if(FFTWL_LIB) + set(FFTW_LIBRARIES ${FFTW_LIBRARIES} ${FFTWL_LIB}) +endif() + +set( CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_SAV} ) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(FFTW DEFAULT_MSG + FFTW_INCLUDES
FFTW_LIBRARIES) + +mark_as_advanced(FFTW_INCLUDES FFTW_LIBRARIES FFTW_LIB FFTWF_LIB FFTWL_LIB) + diff --git a/externals/eigen/cmake/FindGLEW.cmake b/externals/eigen/cmake/FindGLEW.cmake new file mode 100644 index 00000000..9d486d5b --- /dev/null +++ b/externals/eigen/cmake/FindGLEW.cmake @@ -0,0 +1,105 @@ +# Copyright (c) 2009 Boudewijn Rempt +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. +# +# - try to find glew library and include files +# GLEW_INCLUDE_DIR, where to find GL/glew.h, etc. +# GLEW_LIBRARIES, the libraries to link against +# GLEW_FOUND, If false, do not try to use GLEW. +# Also defined, but not for general use are: +# GLEW_GLEW_LIBRARY = the full path to the glew library. + +if (WIN32) + + if(CYGWIN) + + find_path( GLEW_INCLUDE_DIR GL/glew.h) + + find_library( GLEW_GLEW_LIBRARY glew32 + ${OPENGL_LIBRARY_DIR} + /usr/lib/w32api + /usr/X11R6/lib + ) + + + else(CYGWIN) + + find_path( GLEW_INCLUDE_DIR GL/glew.h + $ENV{GLEW_ROOT_PATH}/include + ) + + find_library( GLEW_GLEW_LIBRARY + NAMES glew glew32 + PATHS + $ENV{GLEW_ROOT_PATH}/lib + ${OPENGL_LIBRARY_DIR} + ) + + endif(CYGWIN) + +else (WIN32) + + if (APPLE) +# These values for Apple could probably do with improvement. 
+ find_path( GLEW_INCLUDE_DIR glew.h + /System/Library/Frameworks/GLEW.framework/Versions/A/Headers + ${OPENGL_LIBRARY_DIR} + ) + set(GLEW_GLEW_LIBRARY "-framework GLEW" CACHE STRING "GLEW library for OSX") + set(GLEW_cocoa_LIBRARY "-framework Cocoa" CACHE STRING "Cocoa framework for OSX") + else (APPLE) + + find_path( GLEW_INCLUDE_DIR GL/glew.h + /usr/include/GL + /usr/openwin/share/include + /usr/openwin/include + /usr/X11R6/include + /usr/include/X11 + /opt/graphics/OpenGL/include + /opt/graphics/OpenGL/contrib/libglew + ) + + find_library( GLEW_GLEW_LIBRARY GLEW + /usr/openwin/lib + /usr/X11R6/lib + ) + + endif (APPLE) + +endif (WIN32) + +set( GLEW_FOUND "NO" ) +if(GLEW_INCLUDE_DIR) + if(GLEW_GLEW_LIBRARY) + # Is -lXi and -lXmu required on all platforms that have it? + # If not, we need some way to figure out what platform we are on. + set( GLEW_LIBRARIES + ${GLEW_GLEW_LIBRARY} + ${GLEW_cocoa_LIBRARY} + ) + set( GLEW_FOUND "YES" ) + +#The following deprecated settings are for backwards compatibility with CMake1.4 + set (GLEW_LIBRARY ${GLEW_LIBRARIES}) + set (GLEW_INCLUDE_PATH ${GLEW_INCLUDE_DIR}) + + endif(GLEW_GLEW_LIBRARY) +endif(GLEW_INCLUDE_DIR) + +if(GLEW_FOUND) + if(NOT GLEW_FIND_QUIETLY) + message(STATUS "Found Glew: ${GLEW_LIBRARIES}") + endif(NOT GLEW_FIND_QUIETLY) +else(GLEW_FOUND) + if(GLEW_FIND_REQUIRED) + message(FATAL_ERROR "Could not find Glew") + endif(GLEW_FIND_REQUIRED) +endif(GLEW_FOUND) + +mark_as_advanced( + GLEW_INCLUDE_DIR + GLEW_GLEW_LIBRARY + GLEW_Xmu_LIBRARY + GLEW_Xi_LIBRARY +) diff --git a/externals/eigen/cmake/FindGMP.cmake b/externals/eigen/cmake/FindGMP.cmake new file mode 100644 index 00000000..c41eedcf --- /dev/null +++ b/externals/eigen/cmake/FindGMP.cmake @@ -0,0 +1,21 @@ +# Try to find the GNU Multiple Precision Arithmetic Library (GMP) +# See http://gmplib.org/ + +if (GMP_INCLUDES AND GMP_LIBRARIES) + set(GMP_FIND_QUIETLY TRUE) +endif () + +find_path(GMP_INCLUDES + NAMES + gmp.h + PATHS + $ENV{GMPDIR} + 
${INCLUDE_INSTALL_DIR} +) + +find_library(GMP_LIBRARIES gmp PATHS $ENV{GMPDIR} ${LIB_INSTALL_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GMP DEFAULT_MSG + GMP_INCLUDES GMP_LIBRARIES) +mark_as_advanced(GMP_INCLUDES GMP_LIBRARIES) diff --git a/externals/eigen/cmake/FindGSL.cmake b/externals/eigen/cmake/FindGSL.cmake new file mode 100644 index 00000000..8632232f --- /dev/null +++ b/externals/eigen/cmake/FindGSL.cmake @@ -0,0 +1,170 @@ +# Try to find gnu scientific library GSL +# See +# http://www.gnu.org/software/gsl/ and +# http://gnuwin32.sourceforge.net/packages/gsl.htm +# +# Once run this will define: +# +# GSL_FOUND = system has GSL lib +# +# GSL_LIBRARIES = full path to the libraries +# on Unix/Linux with additional linker flags from "gsl-config --libs" +# +# CMAKE_GSL_CXX_FLAGS = Unix compiler flags for GSL, essentially "`gsl-config --cxxflags`" +# +# GSL_INCLUDE_DIR = where to find headers +# +# GSL_LINK_DIRECTORIES = link directories, useful for rpath on Unix +# GSL_EXE_LINKER_FLAGS = rpath on Unix +# +# Felix Woelk 07/2004 +# Jan Woetzel +# +# www.mip.informatik.uni-kiel.de +# -------------------------------- + +if(WIN32) + # JW tested with gsl-1.8, Windows XP, MSVS 7.1 + set(GSL_POSSIBLE_ROOT_DIRS + ${GSL_ROOT_DIR} + $ENV{GSL_ROOT_DIR} + ${GSL_DIR} + ${GSL_HOME} + $ENV{GSL_DIR} + $ENV{GSL_HOME} + $ENV{EXTRA} + "C:/Program Files/GnuWin32" + ) + find_path(GSL_INCLUDE_DIR + NAMES gsl/gsl_cdf.h gsl/gsl_randist.h + PATHS ${GSL_POSSIBLE_ROOT_DIRS} + PATH_SUFFIXES include + DOC "GSL header include dir" + ) + + find_library(GSL_GSL_LIBRARY + NAMES libgsl.dll.a gsl libgsl + PATHS ${GSL_POSSIBLE_ROOT_DIRS} + PATH_SUFFIXES lib + DOC "GSL library" ) + + if(NOT GSL_GSL_LIBRARY) + find_file(GSL_GSL_LIBRARY + NAMES libgsl.dll.a + PATHS ${GSL_POSSIBLE_ROOT_DIRS} + PATH_SUFFIXES lib + DOC "GSL library") + endif() + + find_library(GSL_GSLCBLAS_LIBRARY + NAMES libgslcblas.dll.a gslcblas libgslcblas + PATHS ${GSL_POSSIBLE_ROOT_DIRS} + 
PATH_SUFFIXES lib + DOC "GSL cblas library dir" ) + + if(NOT GSL_GSLCBLAS_LIBRARY) + find_file(GSL_GSLCBLAS_LIBRARY + NAMES libgslcblas.dll.a + PATHS ${GSL_POSSIBLE_ROOT_DIRS} + PATH_SUFFIXES lib + DOC "GSL library") + endif() + + set(GSL_LIBRARIES ${GSL_GSL_LIBRARY}) + + #message("DBG\n" + # "GSL_GSL_LIBRARY=${GSL_GSL_LIBRARY}\n" + # "GSL_GSLCBLAS_LIBRARY=${GSL_GSLCBLAS_LIBRARY}\n" + # "GSL_LIBRARIES=${GSL_LIBRARIES}") + + +else(WIN32) + + if(UNIX) + set(GSL_CONFIG_PREFER_PATH + "$ENV{GSL_DIR}/bin" + "$ENV{GSL_DIR}" + "$ENV{GSL_HOME}/bin" + "$ENV{GSL_HOME}" + CACHE STRING "preferred path to GSL (gsl-config)") + find_program(GSL_CONFIG gsl-config + ${GSL_CONFIG_PREFER_PATH} + /usr/bin/ + ) + # message("DBG GSL_CONFIG ${GSL_CONFIG}") + + if (GSL_CONFIG) + # set CXXFLAGS to be fed into CXX_FLAGS by the user: + set(GSL_CXX_FLAGS "`${GSL_CONFIG} --cflags`") + + # set INCLUDE_DIRS to prefix+include + exec_program(${GSL_CONFIG} + ARGS --prefix + OUTPUT_VARIABLE GSL_PREFIX) + set(GSL_INCLUDE_DIR ${GSL_PREFIX}/include CACHE STRING INTERNAL) + + # set link libraries and link flags + #set(GSL_LIBRARIES "`${GSL_CONFIG} --libs`") + exec_program(${GSL_CONFIG} + ARGS --libs + OUTPUT_VARIABLE GSL_LIBRARIES ) + + # extract link dirs for rpath + exec_program(${GSL_CONFIG} + ARGS --libs + OUTPUT_VARIABLE GSL_CONFIG_LIBS ) + + # extract version + exec_program(${GSL_CONFIG} + ARGS --version + OUTPUT_VARIABLE GSL_FULL_VERSION ) + + # split version as major/minor + string(REGEX MATCH "(.)\\..*" GSL_VERSION_MAJOR_ "${GSL_FULL_VERSION}") + set(GSL_VERSION_MAJOR ${CMAKE_MATCH_1}) + string(REGEX MATCH ".\\.(.*)" GSL_VERSION_MINOR_ "${GSL_FULL_VERSION}") + set(GSL_VERSION_MINOR ${CMAKE_MATCH_1}) + + # split off the link dirs (for rpath) + # use regular expression to match wildcard equivalent "-L*" + # with is a space or a semicolon + string(REGEX MATCHALL "[-][L]([^ ;])+" + GSL_LINK_DIRECTORIES_WITH_PREFIX + "${GSL_CONFIG_LIBS}" ) + # message("DBG 
GSL_LINK_DIRECTORIES_WITH_PREFIX=${GSL_LINK_DIRECTORIES_WITH_PREFIX}") + + # remove prefix -L because we need the pure directory for LINK_DIRECTORIES + + if (GSL_LINK_DIRECTORIES_WITH_PREFIX) + string(REGEX REPLACE "[-][L]" "" GSL_LINK_DIRECTORIES ${GSL_LINK_DIRECTORIES_WITH_PREFIX} ) + endif (GSL_LINK_DIRECTORIES_WITH_PREFIX) + set(GSL_EXE_LINKER_FLAGS "-Wl,-rpath,${GSL_LINK_DIRECTORIES}" CACHE STRING INTERNAL) + # message("DBG GSL_LINK_DIRECTORIES=${GSL_LINK_DIRECTORIES}") + # message("DBG GSL_EXE_LINKER_FLAGS=${GSL_EXE_LINKER_FLAGS}") + + # add_definitions("-DHAVE_GSL") + # set(GSL_DEFINITIONS "-DHAVE_GSL") + mark_as_advanced( + GSL_CXX_FLAGS + GSL_INCLUDE_DIR + GSL_LIBRARIES + GSL_LINK_DIRECTORIES + GSL_DEFINITIONS + ) + message(STATUS "Using GSL from ${GSL_PREFIX}") + + else(GSL_CONFIG) + message("FindGSL.cmake: gsl-config not found. Please set it manually. GSL_CONFIG=${GSL_CONFIG}") + endif(GSL_CONFIG) + + endif(UNIX) +endif(WIN32) + + +if(GSL_LIBRARIES) + if(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS) + + set(GSL_FOUND 1) + + endif(GSL_INCLUDE_DIR OR GSL_CXX_FLAGS) +endif(GSL_LIBRARIES) diff --git a/externals/eigen/cmake/FindGoogleHash.cmake b/externals/eigen/cmake/FindGoogleHash.cmake new file mode 100644 index 00000000..481eb4da --- /dev/null +++ b/externals/eigen/cmake/FindGoogleHash.cmake @@ -0,0 +1,23 @@ + +if (GOOGLEHASH_INCLUDES AND GOOGLEHASH_LIBRARIES) + set(GOOGLEHASH_FIND_QUIETLY TRUE) +endif () + +find_path(GOOGLEHASH_INCLUDES + NAMES + google/dense_hash_map + PATHS + ${INCLUDE_INSTALL_DIR} +) + +if(GOOGLEHASH_INCLUDES) + # let's make sure it compiles with the current compiler (the probe must include both hash-map headers and instantiate the templates) + file(WRITE ${CMAKE_BINARY_DIR}/googlehash_test.cpp + "#include <google/sparse_hash_map>\n#include <google/dense_hash_map>\nint main(int argc, char** argv) { google::dense_hash_map<int,int> a; google::sparse_hash_map<int,int> b; return 0;}\n") + try_compile(GOOGLEHASH_COMPILE ${CMAKE_BINARY_DIR} ${CMAKE_BINARY_DIR}/googlehash_test.cpp OUTPUT_VARIABLE GOOGLEHASH_COMPILE_RESULT) +endif() + +include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GoogleHash DEFAULT_MSG GOOGLEHASH_INCLUDES GOOGLEHASH_COMPILE) + +mark_as_advanced(GOOGLEHASH_INCLUDES) diff --git a/externals/eigen/cmake/FindHWLOC.cmake b/externals/eigen/cmake/FindHWLOC.cmake new file mode 100644 index 00000000..522f5215 --- /dev/null +++ b/externals/eigen/cmake/FindHWLOC.cmake @@ -0,0 +1,332 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find HWLOC include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(HWLOC +# [REQUIRED]) # Fail with error if hwloc is not found +# +# This module finds headers and hwloc library. +# Results are reported in variables: +# HWLOC_FOUND - True if headers and requested libraries were found +# HWLOC_INCLUDE_DIRS - hwloc include directories +# HWLOC_LIBRARY_DIRS - Link directories for hwloc libraries +# HWLOC_LIBRARIES - hwloc component libraries to be linked +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DHWLOC_DIR=path/to/hwloc): +# HWLOC_DIR - Where to find the base directory of hwloc +# HWLOC_INCDIR - Where to find the header files +# HWLOC_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: HWLOC_DIR, HWLOC_INCDIR, HWLOC_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file 
MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) + +include(CheckStructHasMember) +include(CheckCSourceCompiles) + +if (NOT HWLOC_FOUND) + set(HWLOC_DIR "" CACHE PATH "Installation directory of HWLOC library") + if (NOT HWLOC_FIND_QUIETLY) + message(STATUS "A cache variable, namely HWLOC_DIR, has been set to specify the install directory of HWLOC") + endif() +endif() + +set(ENV_HWLOC_DIR "$ENV{HWLOC_DIR}") +set(ENV_HWLOC_INCDIR "$ENV{HWLOC_INCDIR}") +set(ENV_HWLOC_LIBDIR "$ENV{HWLOC_LIBDIR}") +set(HWLOC_GIVEN_BY_USER "FALSE") +if ( HWLOC_DIR OR ( HWLOC_INCDIR AND HWLOC_LIBDIR) OR ENV_HWLOC_DIR OR (ENV_HWLOC_INCDIR AND ENV_HWLOC_LIBDIR) ) + set(HWLOC_GIVEN_BY_USER "TRUE") +endif() + +# Optionally use pkg-config to detect include/library dirs (if pkg-config is available) +# ------------------------------------------------------------------------------------- +include(CMakeFindDependencyMacro) +# include(FindPkgConfig) +find_dependency(PkgConfig QUIET) +if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) + + pkg_search_module(HWLOC hwloc) + if (NOT HWLOC_FIND_QUIETLY) + if (HWLOC_FOUND AND HWLOC_LIBRARIES) + message(STATUS "Looking for HWLOC - found using PkgConfig") + #if(NOT HWLOC_INCLUDE_DIRS) + # message("${Magenta}HWLOC_INCLUDE_DIRS is empty using PkgConfig." + # "Perhaps the path to hwloc headers is already present in your" + # "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}") + #endif() + else() + message(STATUS "${Magenta}Looking for HWLOC - not found using PkgConfig." 
+ "\n Perhaps you should add the directory containing hwloc.pc to" + "\n the PKG_CONFIG_PATH environment variable.${ColourReset}") + endif() + endif() + +endif() + +if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) ) + + if (NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for HWLOC - PkgConfig not used") + endif() + + # Looking for include + # ------------------- + + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + if(ENV_HWLOC_INCDIR) + list(APPEND _inc_env "${ENV_HWLOC_INCDIR}") + elseif(ENV_HWLOC_DIR) + list(APPEND _inc_env "${ENV_HWLOC_DIR}") + list(APPEND _inc_env "${ENV_HWLOC_DIR}/include") + list(APPEND _inc_env "${ENV_HWLOC_DIR}/include/hwloc") + else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") + + # Try to find the hwloc header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(HWLOC_INCDIR) + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${HWLOC_INCDIR}) + else() + if(HWLOC_DIR) + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${HWLOC_DIR} + PATH_SUFFIXES "include" "include/hwloc") + 
else() + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${PATH_TO_LOOK_FOR} + PATH_SUFFIXES "hwloc") + endif() + endif() + mark_as_advanced(HWLOC_hwloc.h_DIRS) + + # Add path to cmake variable + # ------------------------------------ + if (HWLOC_hwloc.h_DIRS) + set(HWLOC_INCLUDE_DIRS "${HWLOC_hwloc.h_DIRS}") + else () + set(HWLOC_INCLUDE_DIRS "HWLOC_INCLUDE_DIRS-NOTFOUND") + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc -- hwloc.h not found") + endif() + endif () + + if (HWLOC_INCLUDE_DIRS) + list(REMOVE_DUPLICATES HWLOC_INCLUDE_DIRS) + endif () + + + # Looking for lib + # --------------- + + # Add system library paths to search lib + # -------------------------------------- + unset(_lib_env) + if(ENV_HWLOC_LIBDIR) + list(APPEND _lib_env "${ENV_HWLOC_LIBDIR}") + elseif(ENV_HWLOC_DIR) + list(APPEND _lib_env "${ENV_HWLOC_DIR}") + list(APPEND _lib_env "${ENV_HWLOC_DIR}/lib") + else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif() + list(REMOVE_DUPLICATES _lib_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_lib_env}") + + # Try to find the hwloc lib in the given paths + # ---------------------------------------------- + + # call cmake macro to find the lib path + if(HWLOC_LIBDIR) + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${HWLOC_LIBDIR}) + else() + if(HWLOC_DIR) + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${HWLOC_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(HWLOC_hwloc_LIBRARY 
"HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${PATH_TO_LOOK_FOR}) + endif() + endif() + mark_as_advanced(HWLOC_hwloc_LIBRARY) + + # If found, add path to cmake variable + # ------------------------------------ + if (HWLOC_hwloc_LIBRARY) + get_filename_component(hwloc_lib_path ${HWLOC_hwloc_LIBRARY} PATH) + # set cmake variables (respects naming convention) + set(HWLOC_LIBRARIES "${HWLOC_hwloc_LIBRARY}") + set(HWLOC_LIBRARY_DIRS "${hwloc_lib_path}") + else () + set(HWLOC_LIBRARIES "HWLOC_LIBRARIES-NOTFOUND") + set(HWLOC_LIBRARY_DIRS "HWLOC_LIBRARY_DIRS-NOTFOUND") + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc -- lib hwloc not found") + endif() + endif () + + if (HWLOC_LIBRARY_DIRS) + list(REMOVE_DUPLICATES HWLOC_LIBRARY_DIRS) + endif () + + # check a function to validate the find + if(HWLOC_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # HWLOC + if (HWLOC_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}") + endif() + if (HWLOC_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${HWLOC_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${HWLOC_LIBRARIES}") + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(HWLOC_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(hwloc_topology_init HWLOC_WORKS) + mark_as_advanced(HWLOC_WORKS) + + if(NOT HWLOC_WORKS) + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc : test of hwloc_topology_init with hwloc library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: 
${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) + endif() + +endif() + +if (HWLOC_LIBRARIES) + if (HWLOC_LIBRARY_DIRS) + list(GET HWLOC_LIBRARY_DIRS 0 first_lib_path) + else() + list(GET HWLOC_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + endif() + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(HWLOC_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of HWLOC library" FORCE) + else() + set(HWLOC_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of HWLOC library" FORCE) + endif() +endif() +mark_as_advanced(HWLOC_DIR) +mark_as_advanced(HWLOC_DIR_FOUND) + +# check that HWLOC has been found +# ------------------------------- +include(FindPackageHandleStandardArgs) +if (PKG_CONFIG_EXECUTABLE AND HWLOC_FOUND) + find_package_handle_standard_args(HWLOC DEFAULT_MSG + HWLOC_LIBRARIES) +else() + find_package_handle_standard_args(HWLOC DEFAULT_MSG + HWLOC_LIBRARIES + HWLOC_WORKS) +endif() + +if (HWLOC_FOUND) + set(HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list(APPEND CMAKE_REQUIRED_INCLUDES ${HWLOC_INCLUDE_DIRS}) + + # test headers to guess the version (every probe below reads hwloc.h) + check_struct_has_member( "struct hwloc_obj" parent hwloc.h HAVE_HWLOC_PARENT_MEMBER ) + check_struct_has_member( "struct hwloc_cache_attr_s" size hwloc.h HAVE_HWLOC_CACHE_ATTR ) + check_c_source_compiles( "#include <hwloc.h> + int main(void) { hwloc_obj_t o; o->type = HWLOC_OBJ_PU; return 0;}" HAVE_HWLOC_OBJ_PU) + include(CheckLibraryExists) + check_library_exists(${HWLOC_LIBRARIES} hwloc_bitmap_free "" HAVE_HWLOC_BITMAP) + + set(CMAKE_REQUIRED_INCLUDES ${HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES}) +endif() diff --git a/externals/eigen/cmake/FindKLU.cmake b/externals/eigen/cmake/FindKLU.cmake new
file mode 100644 index 00000000..6217d149 --- /dev/null +++ b/externals/eigen/cmake/FindKLU.cmake @@ -0,0 +1,48 @@ +# KLU lib usually requires linking to a blas library. +# It is up to the user of this module to find a BLAS and link to it. + +if (KLU_INCLUDES AND KLU_LIBRARIES) + set(KLU_FIND_QUIETLY TRUE) +endif () + +find_path(KLU_INCLUDES + NAMES + klu.h + PATHS + $ENV{KLUDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +find_library(KLU_LIBRARIES klu PATHS $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + +if(KLU_LIBRARIES) + + if(NOT KLU_LIBDIR) + get_filename_component(KLU_LIBDIR ${KLU_LIBRARIES} PATH) + endif() + + find_library(COLAMD_LIBRARY colamd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(COLAMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${COLAMD_LIBRARY}) + endif () + + find_library(AMD_LIBRARY amd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(AMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY}) + endif () + + find_library(BTF_LIBRARY btf PATHS ${KLU_LIBDIR} $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(BTF_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${BTF_LIBRARY}) + endif() + +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(KLU DEFAULT_MSG + KLU_INCLUDES KLU_LIBRARIES) + +mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY BTF_LIBRARY) diff --git a/externals/eigen/cmake/FindLAPACK.cmake b/externals/eigen/cmake/FindLAPACK.cmake new file mode 100644 index 00000000..3fd73880 --- /dev/null +++ b/externals/eigen/cmake/FindLAPACK.cmake @@ -0,0 +1,274 @@ +# Find LAPACK library +# +# This module finds an installed library that implements the LAPACK +# linear-algebra interface (see http://www.netlib.org/lapack/). +# The approach follows mostly that taken for the autoconf macro file, acx_lapack.m4 +# (distributed at http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html).
+# +# This module sets the following variables: +# LAPACK_FOUND - set to true if a library implementing the LAPACK interface +# is found +# LAPACK_INCLUDE_DIR - Directories containing the LAPACK header files +# LAPACK_DEFINITIONS - Compilation options to use LAPACK +# LAPACK_LINKER_FLAGS - Linker flags to use LAPACK (excluding -l +# and -L). +# LAPACK_LIBRARIES_DIR - Directories containing the LAPACK libraries. +# May be null if LAPACK_LIBRARIES contains libraries name using full path. +# LAPACK_LIBRARIES - List of libraries to link against LAPACK interface. +# May be null if the compiler supports auto-link (e.g. VC++). +# LAPACK_USE_FILE - The name of the cmake module to include to compile +# applications or libraries using LAPACK. +# +# This module was modified by CGAL team: +# - find libraries for a C++ compiler, instead of Fortran +# - added LAPACK_INCLUDE_DIR, LAPACK_DEFINITIONS and LAPACK_LIBRARIES_DIR +# - removed LAPACK95_LIBRARIES + + +include(CheckFunctionExists) +include(CMakeFindDependencyMacro) + +# This macro checks for the existence of the combination of fortran libraries +# given by _list. If the combination is found, this macro checks (using the +# check_function_exists macro) whether can link against that library +# combination using the name of a routine given by _name using the linker +# flags given by _flags. If the combination of libraries is found and passes +# the link test, LIBRARIES is set to the list of complete library paths that +# have been found and DEFINITIONS to the required definitions. +# Otherwise, LIBRARIES is set to FALSE. +# N.B. _prefix is the prefix applied to the names of all cached variables that +# are generated internally and marked advanced by this macro. 
+macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _blas _path) + #message("DEBUG: check_lapack_libraries(${_list} in ${_path} with ${_blas})") + + # Check for the existence of the libraries given by _list + set(_libraries_found TRUE) + set(_libraries_work FALSE) + set(${DEFINITIONS} "") + set(${LIBRARIES} "") + set(_combined_name) + foreach(_library ${_list}) + set(_combined_name ${_combined_name}_${_library}) + + if(_libraries_found) + # search first in ${_path} + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS ${_path} NO_DEFAULT_PATH + ) + # if not found, search in environment variables and system + if ( WIN32 ) + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS ENV LIB + ) + elseif ( APPLE ) + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV DYLD_LIBRARY_PATH + ) + else () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH + ) + endif() + mark_as_advanced(${_prefix}_${_library}_LIBRARY) + set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) + set(_libraries_found ${${_prefix}_${_library}_LIBRARY}) + endif() + endforeach() + if(_libraries_found) + set(_libraries_found ${${LIBRARIES}}) + endif() + + # Test this combination of libraries with the Fortran/f2c interface. + # We test the Fortran interface first as it is well standardized. + if(_libraries_found AND NOT _libraries_work) + set(${DEFINITIONS} "-D${_prefix}_USE_F2C") + set(${LIBRARIES} ${_libraries_found}) + # Some C++ linkers require the f2c library to link with Fortran libraries. + # I do not know which ones, thus I just add the f2c library if it is available. 
+ find_dependency( F2C QUIET ) + if ( F2C_FOUND ) + set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) + set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) + endif() + set(CMAKE_REQUIRED_DEFINITIONS ${${DEFINITIONS}}) + set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas}) + #message("DEBUG: CMAKE_REQUIRED_DEFINITIONS = ${CMAKE_REQUIRED_DEFINITIONS}") + #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") + # Check if function exists with f2c calling convention (ie a trailing underscore) + check_function_exists(${_name}_ ${_prefix}_${_name}_${_combined_name}_f2c_WORKS) + set(CMAKE_REQUIRED_DEFINITIONS "") + set(CMAKE_REQUIRED_LIBRARIES "") + mark_as_advanced(${_prefix}_${_name}_${_combined_name}_f2c_WORKS) + set(_libraries_work ${${_prefix}_${_name}_${_combined_name}_f2c_WORKS}) + endif() + + # If not found, test this combination of libraries with a C interface. + # A few implementations (ie ACML) provide a C interface. Unfortunately, there is no standard.
+ if(_libraries_found AND NOT _libraries_work) + set(${DEFINITIONS} "") + set(${LIBRARIES} ${_libraries_found}) + set(CMAKE_REQUIRED_DEFINITIONS "") + set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas}) + #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") + check_function_exists(${_name} ${_prefix}_${_name}${_combined_name}_WORKS) + set(CMAKE_REQUIRED_LIBRARIES "") + mark_as_advanced(${_prefix}_${_name}${_combined_name}_WORKS) + set(_libraries_work ${${_prefix}_${_name}${_combined_name}_WORKS}) + endif() + + # on failure + if(NOT _libraries_work) + set(${DEFINITIONS} "") + set(${LIBRARIES} FALSE) + endif() + #message("DEBUG: ${DEFINITIONS} = ${${DEFINITIONS}}") + #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") +endmacro() + + +# +# main: find BLAS, then probe candidate libraries for the LAPACK routine cheev and cache the results +# + +# LAPACK requires BLAS +if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) + find_dependency(BLAS) +else() + find_dependency(BLAS REQUIRED) +endif() + +if (NOT BLAS_FOUND) + + message(STATUS "LAPACK requires BLAS.") + set(LAPACK_FOUND FALSE) + +# Is it already configured? +elseif (LAPACK_LIBRARIES_DIR OR LAPACK_LIBRARIES) + + set(LAPACK_FOUND TRUE) + +else() + + # reset variables + set( LAPACK_INCLUDE_DIR "" ) + set( LAPACK_DEFINITIONS "" ) + set( LAPACK_LINKER_FLAGS "" ) # unused (yet) + set( LAPACK_LIBRARIES "" ) + set( LAPACK_LIBRARIES_DIR "" ) + + # + # Search for the LAPACK function cheev in candidate libraries, first match wins (no platform test is made here) + # + + #intel mkl lapack? + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_DEFINITIONS + LAPACK_LIBRARIES + LAPACK + cheev + "" + "mkl_lapack" + "${BLAS_LIBRARIES}" + "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" + ) + endif() + + #acml lapack? + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_DEFINITIONS + LAPACK_LIBRARIES + LAPACK + cheev + "" + "acml" + "${BLAS_LIBRARIES}" + "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" + ) + endif() + + # Apple LAPACK library? 
+ if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_DEFINITIONS + LAPACK_LIBRARIES + LAPACK + cheev + "" + "Accelerate" + "${BLAS_LIBRARIES}" + "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" + ) + endif() + + if ( NOT LAPACK_LIBRARIES ) + check_lapack_libraries( + LAPACK_DEFINITIONS + LAPACK_LIBRARIES + LAPACK + cheev + "" + "vecLib" + "${BLAS_LIBRARIES}" + "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" + ) + endif () + + # Generic LAPACK library? + # This configuration *must* be the last try as this library is notably slow. + if ( NOT LAPACK_LIBRARIES ) + check_lapack_libraries( + LAPACK_DEFINITIONS + LAPACK_LIBRARIES + LAPACK + cheev + "" + "lapack" + "${BLAS_LIBRARIES}" + "${CGAL_TAUCS_LIBRARIES_DIR} ENV LAPACK_LIB_DIR" + ) + endif() + + if(LAPACK_LIBRARIES_DIR OR LAPACK_LIBRARIES) + set(LAPACK_FOUND TRUE) + else() + set(LAPACK_FOUND FALSE) + endif() + + if(NOT LAPACK_FIND_QUIETLY) + if(LAPACK_FOUND) + message(STATUS "A library with LAPACK API found.") + else() + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR "A required library with LAPACK API not found. Please specify library location.") + else() + message(STATUS "A library with LAPACK API not found. 
Please specify library location.") + endif() + endif() + endif() + + # Add variables to cache (FORCE overwrites any previously cached values) + set( LAPACK_INCLUDE_DIR "${LAPACK_INCLUDE_DIR}" + CACHE PATH "Directories containing the LAPACK header files" FORCE ) + set( LAPACK_DEFINITIONS "${LAPACK_DEFINITIONS}" + CACHE STRING "Compilation options to use LAPACK" FORCE ) + set( LAPACK_LINKER_FLAGS "${LAPACK_LINKER_FLAGS}" + CACHE STRING "Linker flags to use LAPACK" FORCE ) + set( LAPACK_LIBRARIES "${LAPACK_LIBRARIES}" + CACHE FILEPATH "LAPACK libraries name" FORCE ) + set( LAPACK_LIBRARIES_DIR "${LAPACK_LIBRARIES_DIR}" + CACHE PATH "Directories containing the LAPACK libraries" FORCE ) + + #message("DEBUG: LAPACK_INCLUDE_DIR = ${LAPACK_INCLUDE_DIR}") + #message("DEBUG: LAPACK_DEFINITIONS = ${LAPACK_DEFINITIONS}") + #message("DEBUG: LAPACK_LINKER_FLAGS = ${LAPACK_LINKER_FLAGS}") + #message("DEBUG: LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}") + #message("DEBUG: LAPACK_LIBRARIES_DIR = ${LAPACK_LIBRARIES_DIR}") + #message("DEBUG: LAPACK_FOUND = ${LAPACK_FOUND}") + +endif() diff --git a/externals/eigen/cmake/FindMPFR.cmake b/externals/eigen/cmake/FindMPFR.cmake new file mode 100644 index 00000000..d8da9d6f --- /dev/null +++ b/externals/eigen/cmake/FindMPFR.cmake @@ -0,0 +1,83 @@ +# Try to find the MPFR library +# See http://www.mpfr.org/ +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(MPFR 2.3.0) +# to require version 2.3.0 or newer of MPFR. +# +# Once done this will define +# +# MPFR_FOUND - system has MPFR lib with correct version +# MPFR_INCLUDES - the MPFR include directory +# MPFR_LIBRARIES - the MPFR library +# MPFR_VERSION - MPFR version + +# Copyright (c) 2006, 2007 Montel Laurent, +# Copyright (c) 2008, 2009 Gael Guennebaud, +# Copyright (c) 2010 Jitse Niesen, +# Redistribution and use is allowed according to the terms of the BSD license. 
+ +# Set MPFR_INCLUDES (directory containing mpfr.h; searched in $ENV{GMPDIR} and INCLUDE_INSTALL_DIR) + +find_path(MPFR_INCLUDES + NAMES + mpfr.h + PATHS + $ENV{GMPDIR} + ${INCLUDE_INSTALL_DIR} +) + +# Set MPFR_FIND_VERSION to 1.0.0 if no minimum version is specified + +if(NOT MPFR_FIND_VERSION) + if(NOT MPFR_FIND_VERSION_MAJOR) + set(MPFR_FIND_VERSION_MAJOR 1) + endif() + if(NOT MPFR_FIND_VERSION_MINOR) + set(MPFR_FIND_VERSION_MINOR 0) + endif() + if(NOT MPFR_FIND_VERSION_PATCH) + set(MPFR_FIND_VERSION_PATCH 0) + endif() + + set(MPFR_FIND_VERSION "${MPFR_FIND_VERSION_MAJOR}.${MPFR_FIND_VERSION_MINOR}.${MPFR_FIND_VERSION_PATCH}") +endif() + + +if(MPFR_INCLUDES) + + # Set MPFR_VERSION from the MPFR_VERSION_MAJOR/MINOR/PATCHLEVEL defines in mpfr.h + + file(READ "${MPFR_INCLUDES}/mpfr.h" _mpfr_version_header) + + string(REGEX MATCH "define[ \t]+MPFR_VERSION_MAJOR[ \t]+([0-9]+)" _mpfr_major_version_match "${_mpfr_version_header}") + set(MPFR_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPFR_VERSION_MINOR[ \t]+([0-9]+)" _mpfr_minor_version_match "${_mpfr_version_header}") + set(MPFR_MINOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPFR_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpfr_patchlevel_version_match "${_mpfr_version_header}") + set(MPFR_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}") + + set(MPFR_VERSION ${MPFR_MAJOR_VERSION}.${MPFR_MINOR_VERSION}.${MPFR_PATCHLEVEL_VERSION}) + + # Check whether the found version is at least the minimum version (an equal version passes) + + if(${MPFR_VERSION} VERSION_LESS ${MPFR_FIND_VERSION}) + set(MPFR_VERSION_OK FALSE) + message(STATUS "MPFR version ${MPFR_VERSION} found in ${MPFR_INCLUDES}, " + "but at least version ${MPFR_FIND_VERSION} is required") + else() + set(MPFR_VERSION_OK TRUE) + endif() + +endif() + +# Set MPFR_LIBRARIES + +find_library(MPFR_LIBRARIES mpfr PATHS $ENV{GMPDIR} ${LIB_INSTALL_DIR}) + +# Epilogue: MPFR_FOUND requires the include dir, the library and a passing version check + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MPFR DEFAULT_MSG + MPFR_INCLUDES MPFR_LIBRARIES MPFR_VERSION_OK) +mark_as_advanced(MPFR_INCLUDES MPFR_LIBRARIES) diff --git a/externals/eigen/cmake/FindMPREAL.cmake 
b/externals/eigen/cmake/FindMPREAL.cmake new file mode 100644 index 00000000..947a1ce8 --- /dev/null +++ b/externals/eigen/cmake/FindMPREAL.cmake @@ -0,0 +1,103 @@ +# Try to find the MPFR C++ (MPREAL) library +# See http://www.holoborodko.com/pavel/mpreal/ +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(MPREAL 1.8.6) +# to require version 1.8.6 or newer of MPREAL C++. +# +# Once done this will define +# +# MPREAL_FOUND - system has MPREAL lib with correct version +# MPREAL_INCLUDES - MPREAL required include directories +# MPREAL_LIBRARIES - MPREAL required libraries +# MPREAL_VERSION - MPREAL version + +# Copyright (c) 2020 The Eigen Authors. +# Redistribution and use is allowed according to the terms of the BSD license. + +include(CMakeFindDependencyMacro) +find_dependency(MPFR) +find_dependency(GMP) + +# Set MPREAL_INCLUDES +find_path(MPREAL_INCLUDES + NAMES + mpreal.h + PATHS + $ENV{GMPDIR} + ${INCLUDE_INSTALL_DIR} +) + +# Set MPREAL_FIND_VERSION to 1.0.0 if no minimum version is specified + +if(NOT MPREAL_FIND_VERSION) + if(NOT MPREAL_FIND_VERSION_MAJOR) + set(MPREAL_FIND_VERSION_MAJOR 1) + endif() + if(NOT MPREAL_FIND_VERSION_MINOR) + set(MPREAL_FIND_VERSION_MINOR 0) + endif() + if(NOT MPREAL_FIND_VERSION_PATCH) + set(MPREAL_FIND_VERSION_PATCH 0) + endif() + + set(MPREAL_FIND_VERSION "${MPREAL_FIND_VERSION_MAJOR}.${MPREAL_FIND_VERSION_MINOR}.${MPREAL_FIND_VERSION_PATCH}") +endif() + +# Test program compiled further below (check_cxx_source_compiles) to check for these bugs +# - https://github.com/advanpix/mpreal/issues/7 +# - https://github.com/advanpix/mpreal/issues/9 +set(MPREAL_TEST_PROGRAM " +#include <mpreal.h> +#include <algorithm> +int main(int argc, char** argv) { + const mpfr::mpreal one = 1.0; + const mpfr::mpreal zero = 0.0; + using namespace std; + const mpfr::mpreal smaller = min(one, zero); + return 0; +}") + +if(MPREAL_INCLUDES) + + # Set MPREAL_VERSION from the MPREAL_VERSION_MAJOR/MINOR/PATCHLEVEL defines in mpreal.h + + file(READ "${MPREAL_INCLUDES}/mpreal.h" _mpreal_version_header) + + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MAJOR[ \t]+([0-9]+)" 
_mpreal_major_version_match "${_mpreal_version_header}") + set(MPREAL_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MINOR[ \t]+([0-9]+)" _mpreal_minor_version_match "${_mpreal_version_header}") + set(MPREAL_MINOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpreal_patchlevel_version_match "${_mpreal_version_header}") + set(MPREAL_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}") + + set(MPREAL_VERSION ${MPREAL_MAJOR_VERSION}.${MPREAL_MINOR_VERSION}.${MPREAL_PATCHLEVEL_VERSION}) + + # Check whether the found version is at least the minimum version (an equal version passes) + + if(${MPREAL_VERSION} VERSION_LESS ${MPREAL_FIND_VERSION}) + set(MPREAL_VERSION_OK FALSE) + message(STATUS "MPREAL version ${MPREAL_VERSION} found in ${MPREAL_INCLUDES}, " + "but at least version ${MPREAL_FIND_VERSION} is required") + else() + set(MPREAL_VERSION_OK TRUE) + + list(APPEND MPREAL_INCLUDES "${MPFR_INCLUDES}" "${GMP_INCLUDES}") + list(REMOVE_DUPLICATES MPREAL_INCLUDES) + + list(APPEND MPREAL_LIBRARIES "${MPFR_LIBRARIES}" "${GMP_LIBRARIES}") + list(REMOVE_DUPLICATES MPREAL_LIBRARIES) + + # Make sure it compiles with the current compiler. + unset(MPREAL_WORKS CACHE) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES "${MPREAL_INCLUDES}") + set(CMAKE_REQUIRED_LIBRARIES "${MPREAL_LIBRARIES}") + check_cxx_source_compiles("${MPREAL_TEST_PROGRAM}" MPREAL_WORKS) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MPREAL DEFAULT_MSG + MPREAL_INCLUDES MPREAL_VERSION_OK MPREAL_WORKS) +mark_as_advanced(MPREAL_INCLUDES) diff --git a/externals/eigen/cmake/FindMetis.cmake b/externals/eigen/cmake/FindMetis.cmake new file mode 100644 index 00000000..747f8827 --- /dev/null +++ b/externals/eigen/cmake/FindMetis.cmake @@ -0,0 +1,265 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. 
+# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find METIS include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(METIS +# [REQUIRED] # Fail with error if metis is not found +# ) +# +# This module finds headers and metis library. +# Results are reported in variables: +# METIS_FOUND - True if headers and requested libraries were found +# METIS_INCLUDE_DIRS - metis include directories +# METIS_LIBRARY_DIRS - Link directories for metis libraries +# METIS_LIBRARIES - metis component libraries to be linked +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DMETIS_DIR=path/to/metis): +# METIS_DIR - Where to find the base directory of metis +# METIS_INCDIR - Where to find the header files +# METIS_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: METIS_DIR, METIS_INCDIR, METIS_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) 
+ +if (NOT METIS_FOUND) + set(METIS_DIR "" CACHE PATH "Installation directory of METIS library") + if (NOT METIS_FIND_QUIETLY) + message(STATUS "A cache variable, namely METIS_DIR, has been set to specify the install directory of METIS") + endif() +endif() + +# Looking for include +# ------------------- + +# Build _inc_env: env METIS_INCDIR, else env METIS_DIR (and its include subdirs), else the include-path env vars; CMake's implicit include dirs are always appended +# ------------------------------------------ +unset(_inc_env) +set(ENV_METIS_DIR "$ENV{METIS_DIR}") +set(ENV_METIS_INCDIR "$ENV{METIS_INCDIR}") +if(ENV_METIS_INCDIR) + list(APPEND _inc_env "${ENV_METIS_INCDIR}") +elseif(ENV_METIS_DIR) + list(APPEND _inc_env "${ENV_METIS_DIR}") + list(APPEND _inc_env "${ENV_METIS_DIR}/include") + list(APPEND _inc_env "${ENV_METIS_DIR}/include/metis") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the metis header in the given paths +# ------------------------------------------------- +# call find_path with METIS_INCDIR, else METIS_DIR, else the _inc_env list as hints +if(METIS_INCDIR) + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${METIS_INCDIR}) +else() + if(METIS_DIR) + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${METIS_DIR} + PATH_SUFFIXES "include" "include/metis") + else() + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS 
${_inc_env}) + endif() +endif() +mark_as_advanced(METIS_metis.h_DIRS) + + +# If found, add path to cmake variable +# ------------------------------------ +if (METIS_metis.h_DIRS) + set(METIS_INCLUDE_DIRS "${METIS_metis.h_DIRS}") +else () + set(METIS_INCLUDE_DIRS "METIS_INCLUDE_DIRS-NOTFOUND") + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for metis -- metis.h not found") + endif() +endif() + + +# Looking for lib +# --------------- + +# Build _lib_env: env METIS_LIBDIR, else env METIS_DIR (and its lib subdir), else LIB on WIN32, or DYLD_LIBRARY_PATH/LD_LIBRARY_PATH plus CMake's implicit link dirs elsewhere +# -------------------------------------- +unset(_lib_env) +set(ENV_METIS_LIBDIR "$ENV{METIS_LIBDIR}") +if(ENV_METIS_LIBDIR) + list(APPEND _lib_env "${ENV_METIS_LIBDIR}") +elseif(ENV_METIS_DIR) + list(APPEND _lib_env "${ENV_METIS_DIR}") + list(APPEND _lib_env "${ENV_METIS_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the metis lib in the given paths +# ---------------------------------------------- +# call find_library with METIS_LIBDIR, else METIS_DIR, else the _lib_env list as hints +if(METIS_LIBDIR) + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${METIS_LIBDIR}) +else() + if(METIS_DIR) + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${METIS_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${_lib_env}) + endif() +endif() +mark_as_advanced(METIS_metis_LIBRARY) + + +# If found, add path to cmake variable +# ------------------------------------ +if (METIS_metis_LIBRARY) + 
get_filename_component(metis_lib_path "${METIS_metis_LIBRARY}" PATH) + # set cmake variables + set(METIS_LIBRARIES "${METIS_metis_LIBRARY}") + set(METIS_LIBRARY_DIRS "${metis_lib_path}") +else () + set(METIS_LIBRARIES "METIS_LIBRARIES-NOTFOUND") + set(METIS_LIBRARY_DIRS "METIS_LIBRARY_DIRS-NOTFOUND") + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for metis -- lib metis not found") + endif() +endif () + +# validate the find by test-linking the function METIS_NodeND (check_function_exists) +if(METIS_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # METIS + if (METIS_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}") + endif() + if (METIS_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${METIS_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${METIS_LIBRARIES}") + # m (math library; -lm is appended to the link line when it is found) + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link (the cached METIS_WORKS result is cleared first so the check reruns) + unset(METIS_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(METIS_NodeND METIS_WORKS) + mark_as_advanced(METIS_WORKS) + + if(NOT METIS_WORKS) + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for METIS : test of METIS_NodeND with METIS library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif() + +if (METIS_LIBRARIES) + list(GET METIS_LIBRARIES 0 first_lib) + 
get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(METIS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of METIS library" FORCE) + else() + set(METIS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of METIS library" FORCE) + endif() +endif() +mark_as_advanced(METIS_DIR) +mark_as_advanced(METIS_DIR_FOUND) + +# check that METIS has been found (library, successful link test and include dir are all required) +# --------------------------------- +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(METIS DEFAULT_MSG + METIS_LIBRARIES + METIS_WORKS + METIS_INCLUDE_DIRS) +# +# TODO: Add possibility to check for specific functions in the library +# diff --git a/externals/eigen/cmake/FindPASTIX.cmake b/externals/eigen/cmake/FindPASTIX.cmake new file mode 100644 index 00000000..db1427b0 --- /dev/null +++ b/externals/eigen/cmake/FindPASTIX.cmake @@ -0,0 +1,704 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find PASTIX include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(PASTIX +# [REQUIRED] # Fail with error if pastix is not found +# [COMPONENTS ...] 
# dependencies +# ) +# +# PASTIX depends on the following libraries: +# - Threads, m, rt +# - MPI +# - HWLOC +# - BLAS +# +# COMPONENTS are optional libraries PASTIX could be linked with, +# Use it to drive detection of a specific compilation chain +# COMPONENTS can be some of the following: +# - MPI: to activate detection of the parallel MPI version (default) +# it looks for Threads, HWLOC, BLAS, MPI and ScaLAPACK libraries +# - SEQ: to activate detection of the sequential version (exclude MPI version) +# - STARPU: to activate detection of StarPU version +# it looks for MPI version of StarPU (default behaviour) +# if SEQ and STARPU are given, it looks for a StarPU without MPI +# - STARPU_CUDA: to activate detection of StarPU with CUDA +# - STARPU_FXT: to activate detection of StarPU with FxT +# - SCOTCH: to activate detection of PASTIX linked with SCOTCH +# - PTSCOTCH: to activate detection of PASTIX linked with PTSCOTCH +# - METIS: to activate detection of PASTIX linked with METIS +# +# This module finds headers and pastix library. 
+# Results are reported in variables: +# PASTIX_FOUND - True if headers and requested libraries were found +# PASTIX_LINKER_FLAGS - list of required linker flags (excluding -l and -L) +# PASTIX_INCLUDE_DIRS - pastix include directories +# PASTIX_LIBRARY_DIRS - Link directories for pastix libraries +# PASTIX_LIBRARIES - pastix libraries +# PASTIX_INCLUDE_DIRS_DEP - pastix + dependencies include directories +# PASTIX_LIBRARY_DIRS_DEP - pastix + dependencies link directories +# PASTIX_LIBRARIES_DEP - pastix libraries + dependencies +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DPASTIX_DIR=path/to/pastix): +# PASTIX_DIR - Where to find the base directory of pastix +# PASTIX_INCDIR - Where to find the header files +# PASTIX_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: PASTIX_DIR, PASTIX_INCDIR, PASTIX_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) 
+ + +if (NOT PASTIX_FOUND) + set(PASTIX_DIR "" CACHE PATH "Installation directory of PASTIX library") + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "A cache variable, namely PASTIX_DIR, has been set to specify the install directory of PASTIX") + endif() +endif() + +# Set the version to find +set(PASTIX_LOOK_FOR_MPI ON) +set(PASTIX_LOOK_FOR_SEQ OFF) +set(PASTIX_LOOK_FOR_STARPU OFF) +set(PASTIX_LOOK_FOR_STARPU_CUDA OFF) +set(PASTIX_LOOK_FOR_STARPU_FXT OFF) +set(PASTIX_LOOK_FOR_SCOTCH ON) +set(PASTIX_LOOK_FOR_PTSCOTCH OFF) +set(PASTIX_LOOK_FOR_METIS OFF) + +if( PASTIX_FIND_COMPONENTS ) + foreach( component ${PASTIX_FIND_COMPONENTS} ) + if (${component} STREQUAL "SEQ") + # means we look for the sequential version of PaStiX (without MPI) + set(PASTIX_LOOK_FOR_SEQ ON) + set(PASTIX_LOOK_FOR_MPI OFF) + endif() + if (${component} STREQUAL "MPI") + # means we look for the MPI version of PaStiX (default) + set(PASTIX_LOOK_FOR_SEQ OFF) + set(PASTIX_LOOK_FOR_MPI ON) + endif() + if (${component} STREQUAL "STARPU") + # means we look for PaStiX with StarPU + set(PASTIX_LOOK_FOR_STARPU ON) + endif() + if (${component} STREQUAL "STARPU_CUDA") + # means we look for PaStiX with StarPU + CUDA + set(PASTIX_LOOK_FOR_STARPU ON) + set(PASTIX_LOOK_FOR_STARPU_CUDA ON) + endif() + if (${component} STREQUAL "STARPU_FXT") + # means we look for PaStiX with StarPU + FxT + set(PASTIX_LOOK_FOR_STARPU_FXT ON) + endif() + if (${component} STREQUAL "SCOTCH") + set(PASTIX_LOOK_FOR_SCOTCH ON) + endif() + if (${component} STREQUAL "PTSCOTCH") + set(PASTIX_LOOK_FOR_PTSCOTCH ON) + endif() + if (${component} STREQUAL "METIS") + set(PASTIX_LOOK_FOR_METIS ON) + endif() + endforeach() +endif() + +# Dependencies detection +# ---------------------- + + +# Required dependencies +# --------------------- +include(CMakeFindDependencyMacro) +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect pthread") +endif() +if (PASTIX_FIND_REQUIRED) + find_dependency(Threads REQUIRED QUIET) 
+else() + find_dependency(Threads QUIET) +endif() +set(PASTIX_EXTRA_LIBRARIES "") +if( THREADS_FOUND ) + list(APPEND PASTIX_EXTRA_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) +endif () + +# Add math library to the list of extra +# it normally exists on all common systems provided with a C compiler +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect libm") +endif() +set(PASTIX_M_LIBRARIES "") +if(UNIX OR WIN32) + find_library( + PASTIX_M_m_LIBRARY + NAMES m + ) + mark_as_advanced(PASTIX_M_m_LIBRARY) + if (PASTIX_M_m_LIBRARY) + list(APPEND PASTIX_M_LIBRARIES "${PASTIX_M_m_LIBRARY}") + list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_M_m_LIBRARY}") + else() + if (PASTIX_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find libm on your system." + "Are you sure to a have a C compiler installed?") + endif() + endif() +endif() + +# Try to find librt (libposix4 - POSIX.1b Realtime Extensions library) +# on Unix systems except Apple ones because it does not exist on it +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect librt") +endif() +set(PASTIX_RT_LIBRARIES "") +if(UNIX AND NOT APPLE) + find_library( + PASTIX_RT_rt_LIBRARY + NAMES rt + ) + mark_as_advanced(PASTIX_RT_rt_LIBRARY) + if (PASTIX_RT_rt_LIBRARY) + list(APPEND PASTIX_RT_LIBRARIES "${PASTIX_RT_rt_LIBRARY}") + list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_RT_rt_LIBRARY}") + else() + if (PASTIX_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find librt on your system") + endif() + endif() +endif() + +# PASTIX depends on HWLOC +#------------------------ +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect HWLOC") +endif() +if (PASTIX_FIND_REQUIRED) + find_dependency(HWLOC REQUIRED QUIET) +else() + find_dependency(HWLOC QUIET) +endif() + +# PASTIX depends on BLAS +#----------------------- +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect BLAS") +endif() +if (PASTIX_FIND_REQUIRED) + find_dependency(BLASEXT 
REQUIRED QUIET) +else() + find_dependency(BLASEXT QUIET) +endif() + +# Optional dependencies +# --------------------- + +# PASTIX may depend on MPI +#------------------------- +if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect MPI") + endif() + # allows to use an external mpi compilation by setting compilers with + # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90 + # at cmake configure + if(NOT MPI_C_COMPILER) + set(MPI_C_COMPILER mpicc) + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI) + find_dependency(MPI REQUIRED QUIET) + else() + find_dependency(MPI QUIET) + endif() + if (MPI_FOUND) + mark_as_advanced(MPI_LIBRARY) + mark_as_advanced(MPI_EXTRA_LIBRARY) + endif() +endif () + +# PASTIX may depend on STARPU +#---------------------------- +if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) + + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect StarPU") + endif() + + set(PASTIX_STARPU_VERSION "1.1" CACHE STRING "oldest STARPU version desired") + + # create list of components in order to make a single call to find_package(starpu...) 
+ # we explicitly need a StarPU version built with hwloc + set(STARPU_COMPONENT_LIST "HWLOC") + + # StarPU may depend on MPI + # allows to use an external mpi compilation by setting compilers with + # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90 + # at cmake configure + if (PASTIX_LOOK_FOR_MPI) + if(NOT MPI_C_COMPILER) + set(MPI_C_COMPILER mpicc) + endif() + list(APPEND STARPU_COMPONENT_LIST "MPI") + endif() + if (PASTIX_LOOK_FOR_STARPU_CUDA) + list(APPEND STARPU_COMPONENT_LIST "CUDA") + endif() + if (PASTIX_LOOK_FOR_STARPU_FXT) + list(APPEND STARPU_COMPONENT_LIST "FXT") + endif() + # set the list of optional dependencies we may discover + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU) + find_dependency(STARPU ${PASTIX_STARPU_VERSION} REQUIRED + COMPONENTS ${STARPU_COMPONENT_LIST}) + else() + find_dependency(STARPU ${PASTIX_STARPU_VERSION} + COMPONENTS ${STARPU_COMPONENT_LIST}) + endif() + +endif() + +# PASTIX may depends on SCOTCH +#----------------------------- +if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect SCOTCH") + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH) + find_dependency(SCOTCH REQUIRED QUIET) + else() + find_dependency(SCOTCH QUIET) + endif() +endif() + +# PASTIX may depends on PTSCOTCH +#------------------------------- +if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH") + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH) + find_dependency(PTSCOTCH REQUIRED QUIET) + else() + find_dependency(PTSCOTCH QUIET) + endif() +endif() + +# PASTIX may depends on METIS +#---------------------------- +if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect METIS") + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS) + 
find_dependency(METIS REQUIRED QUIET) + else() + find_dependency(METIS QUIET) + endif() +endif() + +# Error if pastix required and no partitioning lib found +if (PASTIX_FIND_REQUIRED AND NOT SCOTCH_FOUND AND NOT PTSCOTCH_FOUND AND NOT METIS_FOUND) + message(FATAL_ERROR "Could NOT find any partitioning library on your system" + " (install scotch, ptscotch or metis)") +endif() + + +# Looking for PaStiX +# ------------------ + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_PASTIX_DIR "$ENV{PASTIX_DIR}") +set(ENV_PASTIX_INCDIR "$ENV{PASTIX_INCDIR}") +if(ENV_PASTIX_INCDIR) + list(APPEND _inc_env "${ENV_PASTIX_INCDIR}") +elseif(ENV_PASTIX_DIR) + list(APPEND _inc_env "${ENV_PASTIX_DIR}") + list(APPEND _inc_env "${ENV_PASTIX_DIR}/include") + list(APPEND _inc_env "${ENV_PASTIX_DIR}/include/pastix") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the pastix header in the given paths +# --------------------------------------------------- +# call cmake macro to find the header path +if(PASTIX_INCDIR) + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${PASTIX_INCDIR}) +else() + if(PASTIX_DIR) + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES 
pastix.h + HINTS ${PASTIX_DIR} + PATH_SUFFIXES "include" "include/pastix") + else() + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${_inc_env} + PATH_SUFFIXES "pastix") + endif() +endif() +mark_as_advanced(PASTIX_pastix.h_DIRS) + +# If found, add path to cmake variable +# ------------------------------------ +if (PASTIX_pastix.h_DIRS) + set(PASTIX_INCLUDE_DIRS "${PASTIX_pastix.h_DIRS}") +else () + set(PASTIX_INCLUDE_DIRS "PASTIX_INCLUDE_DIRS-NOTFOUND") + if(NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for pastix -- pastix.h not found") + endif() +endif() + + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_PASTIX_LIBDIR "$ENV{PASTIX_LIBDIR}") +if(ENV_PASTIX_LIBDIR) + list(APPEND _lib_env "${ENV_PASTIX_LIBDIR}") +elseif(ENV_PASTIX_DIR) + list(APPEND _lib_env "${ENV_PASTIX_DIR}") + list(APPEND _lib_env "${ENV_PASTIX_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the pastix lib in the given paths +# ------------------------------------------------ + +# create list of libs to find +set(PASTIX_libs_to_find "pastix_murge;pastix") + +# call cmake macro to find the lib path +if(PASTIX_LIBDIR) + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${PASTIX_LIBDIR}) + endforeach() +else() + if(PASTIX_DIR) + foreach(pastix_lib ${PASTIX_libs_to_find}) + 
set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${PASTIX_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +# If found, add path to cmake variable +# ------------------------------------ +foreach(pastix_lib ${PASTIX_libs_to_find}) + + get_filename_component(${pastix_lib}_lib_path ${PASTIX_${pastix_lib}_LIBRARY} PATH) + # set cmake variables (respects naming convention) + if (PASTIX_LIBRARIES) + list(APPEND PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}") + else() + set(PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}") + endif() + if (PASTIX_LIBRARY_DIRS) + list(APPEND PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}") + else() + set(PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}") + endif() + mark_as_advanced(PASTIX_${pastix_lib}_LIBRARY) + +endforeach() + +# check a function to validate the find +if(PASTIX_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # PASTIX + if (PASTIX_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${PASTIX_INCLUDE_DIRS}") + endif() + foreach(libdir ${PASTIX_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + set(REQUIRED_LIBS "${PASTIX_LIBRARIES}") + # STARPU + if (PASTIX_LOOK_FOR_STARPU AND STARPU_FOUND) + if (STARPU_INCLUDE_DIRS_DEP) + list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS_DEP}") + elseif (STARPU_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS}") + endif() + if(STARPU_LIBRARY_DIRS_DEP) + list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS_DEP}") + elseif(STARPU_LIBRARY_DIRS) + list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS}") + endif() + if 
(STARPU_LIBRARIES_DEP) + list(APPEND REQUIRED_LIBS "${STARPU_LIBRARIES_DEP}") + elseif (STARPU_LIBRARIES) + foreach(lib ${STARPU_LIBRARIES}) + if (EXISTS ${lib} OR ${lib} MATCHES "^-") + list(APPEND REQUIRED_LIBS "${lib}") + else() + list(APPEND REQUIRED_LIBS "-l${lib}") + endif() + endforeach() + endif() + endif() + # CUDA + if (PASTIX_LOOK_FOR_STARPU_CUDA AND CUDA_FOUND) + if (CUDA_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${CUDA_INCLUDE_DIRS}") + endif() + foreach(libdir ${CUDA_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES}") + endif() + # MPI + if (PASTIX_LOOK_FOR_MPI AND MPI_FOUND) + if (MPI_C_INCLUDE_PATH) + list(APPEND REQUIRED_INCDIRS "${MPI_C_INCLUDE_PATH}") + endif() + if (MPI_C_LINK_FLAGS) + if (${MPI_C_LINK_FLAGS} MATCHES " -") + string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + endif() + list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}") + endif() + list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}") + endif() + # HWLOC + if (HWLOC_FOUND) + if (HWLOC_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}") + endif() + foreach(libdir ${HWLOC_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + foreach(lib ${HWLOC_LIBRARIES}) + if (EXISTS ${lib} OR ${lib} MATCHES "^-") + list(APPEND REQUIRED_LIBS "${lib}") + else() + list(APPEND REQUIRED_LIBS "-l${lib}") + endif() + endforeach() + endif() + # BLAS + if (BLAS_FOUND) + if (BLAS_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${BLAS_INCLUDE_DIRS}") + endif() + foreach(libdir ${BLAS_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${BLAS_LIBRARIES}") + if (BLAS_LINKER_FLAGS) + list(APPEND REQUIRED_LDFLAGS "${BLAS_LINKER_FLAGS}") + endif() + endif() + # SCOTCH + if (PASTIX_LOOK_FOR_SCOTCH AND SCOTCH_FOUND) + if (SCOTCH_INCLUDE_DIRS) + 
list(APPEND REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}") + endif() + foreach(libdir ${SCOTCH_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${SCOTCH_LIBRARIES}") + endif() + # PTSCOTCH + if (PASTIX_LOOK_FOR_PTSCOTCH AND PTSCOTCH_FOUND) + if (PTSCOTCH_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}") + endif() + foreach(libdir ${PTSCOTCH_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}") + endif() + # METIS + if (PASTIX_LOOK_FOR_METIS AND METIS_FOUND) + if (METIS_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}") + endif() + foreach(libdir ${METIS_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${METIS_LIBRARIES}") + endif() + # Fortran + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_ifcore_LIBRARY}") + endif() + endif() + # EXTRA LIBS such that pthread, m, rt + list(APPEND REQUIRED_LIBS ${PASTIX_EXTRA_LIBRARIES}) + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}") + 
string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(PASTIX_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(pastix PASTIX_WORKS) + mark_as_advanced(PASTIX_WORKS) + + if(PASTIX_WORKS) + # save link with dependencies + set(PASTIX_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(PASTIX_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(PASTIX_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(PASTIX_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + list(REMOVE_DUPLICATES PASTIX_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES PASTIX_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES PASTIX_LINKER_FLAGS) + else() + if(NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX : test of pastix() fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + message(STATUS "Maybe PASTIX is linked with specific libraries. " + "Have you tried with COMPONENTS (MPI/SEQ, STARPU, STARPU_CUDA, SCOTCH, PTSCOTCH, METIS)? 
" + "See the explanation in FindPASTIX.cmake.") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif() + +if (PASTIX_LIBRARIES) + list(GET PASTIX_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(PASTIX_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PASTIX library" FORCE) + else() + set(PASTIX_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PASTIX library" FORCE) + endif() +endif() +mark_as_advanced(PASTIX_DIR) +mark_as_advanced(PASTIX_DIR_FOUND) + +# check that PASTIX has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PASTIX DEFAULT_MSG + PASTIX_LIBRARIES + PASTIX_WORKS) diff --git a/externals/eigen/cmake/FindPTSCOTCH.cmake b/externals/eigen/cmake/FindPTSCOTCH.cmake new file mode 100644 index 00000000..6ccc743e --- /dev/null +++ b/externals/eigen/cmake/FindPTSCOTCH.cmake @@ -0,0 +1,422 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find PTSCOTCH include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(PTSCOTCH +# [REQUIRED] # Fail with error if ptscotch is not found +# [COMPONENTS ...] # dependencies +# ) +# +# PTSCOTCH depends on the following libraries: +# - Threads +# - MPI +# +# COMPONENTS can be some of the following: +# - ESMUMPS: to activate detection of PT-Scotch with the esmumps interface +# +# This module finds headers and ptscotch library. 
+# Results are reported in variables: +# PTSCOTCH_FOUND - True if headers and requested libraries were found +# PTSCOTCH_LINKER_FLAGS - list of required linker flags (excluding -l and -L) +# PTSCOTCH_INCLUDE_DIRS - ptscotch include directories +# PTSCOTCH_LIBRARY_DIRS - Link directories for ptscotch libraries +# PTSCOTCH_LIBRARIES - ptscotch component libraries to be linked +# PTSCOTCH_INCLUDE_DIRS_DEP - ptscotch + dependencies include directories +# PTSCOTCH_LIBRARY_DIRS_DEP - ptscotch + dependencies link directories +# PTSCOTCH_LIBRARIES_DEP - ptscotch libraries + dependencies +# PTSCOTCH_INTSIZE - Number of octets occupied by a SCOTCH_Num +# +# The user can give specific paths where to find the libraries by adding cmake +# options at configure (ex: cmake path/to/project -DPTSCOTCH_DIR=path/to/ptscotch): +# PTSCOTCH_DIR - Where to find the base directory of ptscotch +# PTSCOTCH_INCDIR - Where to find the header files +# PTSCOTCH_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variables: PTSCOTCH_DIR, PTSCOTCH_INCDIR, PTSCOTCH_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2016 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.)
+ +if (NOT PTSCOTCH_FOUND) + set(PTSCOTCH_DIR "" CACHE PATH "Installation directory of PTSCOTCH library") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "A cache variable, namely PTSCOTCH_DIR, has been set to specify the install directory of PTSCOTCH") + endif() +endif() + +# Set the version to find +set(PTSCOTCH_LOOK_FOR_ESMUMPS OFF) + +if( PTSCOTCH_FIND_COMPONENTS ) + foreach( component ${PTSCOTCH_FIND_COMPONENTS} ) + if (${component} STREQUAL "ESMUMPS") + # means we look for esmumps library + set(PTSCOTCH_LOOK_FOR_ESMUMPS ON) + endif() + endforeach() +endif() + +# PTSCOTCH depends on Threads, try to find it +include(CMakeFindDependencyMacro) +if (NOT THREADS_FOUND) + if (PTSCOTCH_FIND_REQUIRED) + find_dependency(Threads REQUIRED) + else() + find_dependency(Threads) + endif() +endif() + +# PTSCOTCH depends on MPI, try to find it +if (NOT MPI_FOUND) + if (PTSCOTCH_FIND_REQUIRED) + find_dependency(MPI REQUIRED) + else() + find_dependency(MPI) + endif() +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_PTSCOTCH_DIR "$ENV{PTSCOTCH_DIR}") +set(ENV_PTSCOTCH_INCDIR "$ENV{PTSCOTCH_INCDIR}") +if(ENV_PTSCOTCH_INCDIR) + list(APPEND _inc_env "${ENV_PTSCOTCH_INCDIR}") +elseif(ENV_PTSCOTCH_DIR) + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}") + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include") + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include/ptscotch") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env 
"${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the ptscotch header in the given paths +# ------------------------------------------------- + +set(PTSCOTCH_hdrs_to_find "ptscotch.h;scotch.h") + +# call cmake macro to find the header path +if(PTSCOTCH_INCDIR) + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_INCDIR}) + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() +else() + if(PTSCOTCH_DIR) + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() + else() + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() + endif() +endif() + +# If found, add path to cmake variable +# ------------------------------------ +foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + if (PTSCOTCH_${ptscotch_hdr}_DIRS) + list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}") + else () + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found") + endif() + endif() +endforeach() +list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS) + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_PTSCOTCH_LIBDIR "$ENV{PTSCOTCH_LIBDIR}") 
+if(ENV_PTSCOTCH_LIBDIR) + list(APPEND _lib_env "${ENV_PTSCOTCH_LIBDIR}") +elseif(ENV_PTSCOTCH_DIR) + list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}") + list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the ptscotch lib in the given paths +# ---------------------------------------------- + +set(PTSCOTCH_libs_to_find "ptscotch;ptscotcherr") +if (PTSCOTCH_LOOK_FOR_ESMUMPS) + list(INSERT PTSCOTCH_libs_to_find 0 "ptesmumps") + list(APPEND PTSCOTCH_libs_to_find "esmumps" ) +endif() +list(APPEND PTSCOTCH_libs_to_find "scotch;scotcherr") + +# call cmake macro to find the lib path +if(PTSCOTCH_LIBDIR) + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_LIBDIR}) + endforeach() +else() + if(PTSCOTCH_DIR) + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +set(PTSCOTCH_LIBRARIES "") +set(PTSCOTCH_LIBRARY_DIRS "") +# If found, add path to cmake variable +# ------------------------------------ 
+foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + + if (PTSCOTCH_${ptscotch_lib}_LIBRARY) + get_filename_component(${ptscotch_lib}_lib_path "${PTSCOTCH_${ptscotch_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") + list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}") + else () + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found") + endif() + endif () + + mark_as_advanced(PTSCOTCH_${ptscotch_lib}_LIBRARY) + +endforeach() +list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS) + +# check a function to validate the find +if(PTSCOTCH_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # PTSCOTCH + if (PTSCOTCH_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}") + endif() + if (PTSCOTCH_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${PTSCOTCH_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}") + # MPI + if (MPI_FOUND) + if (MPI_C_INCLUDE_PATH) + list(APPEND REQUIRED_INCDIRS "${MPI_C_INCLUDE_PATH}") + endif() + if (MPI_C_LINK_FLAGS) + if (${MPI_C_LINK_FLAGS} MATCHES " -") + string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + endif() + list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}") + endif() + list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}") + endif() + # THREADS + if(CMAKE_THREAD_LIBS_INIT) + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + endif() + set(Z_LIBRARY "Z_LIBRARY-NOTFOUND") + find_library(Z_LIBRARY NAMES z) + mark_as_advanced(Z_LIBRARY) + if(Z_LIBRARY) + list(APPEND REQUIRED_LIBS "-lz") + endif() + set(M_LIBRARY "M_LIBRARY-NOTFOUND") + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + set(RT_LIBRARY "RT_LIBRARY-NOTFOUND") + find_library(RT_LIBRARY NAMES rt) + mark_as_advanced(RT_LIBRARY) + if(RT_LIBRARY) + list(APPEND REQUIRED_LIBS "-lrt") +
endif() + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(PTSCOTCH_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(SCOTCH_dgraphInit PTSCOTCH_WORKS) + mark_as_advanced(PTSCOTCH_WORKS) + + if(PTSCOTCH_WORKS) + # save link with dependencies + set(PTSCOTCH_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(PTSCOTCH_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(PTSCOTCH_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(PTSCOTCH_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES PTSCOTCH_LINKER_FLAGS) + else() + if(NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for PTSCOTCH : test of SCOTCH_dgraphInit with PTSCOTCH library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif() + +if (PTSCOTCH_LIBRARIES) + list(GET PTSCOTCH_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(PTSCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE) + else() + set(PTSCOTCH_DIR_FOUND "${first_lib_path}" 
CACHE PATH "Installation directory of PTSCOTCH library" FORCE) + endif() +endif() +mark_as_advanced(PTSCOTCH_DIR) +mark_as_advanced(PTSCOTCH_DIR_FOUND) + +# Check the size of SCOTCH_Num +# --------------------------------- +set(CMAKE_REQUIRED_INCLUDES ${PTSCOTCH_INCLUDE_DIRS}) + +include(CheckCSourceRuns) +#stdio.h and stdint.h should be included by scotch.h directly +set(PTSCOTCH_C_TEST_SCOTCH_Num_4 " +#include <stdio.h> +#include <stdint.h> +#include <ptscotch.h> +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 4) + return 0; + else + return 1; +} +") + +set(PTSCOTCH_C_TEST_SCOTCH_Num_8 " +#include <stdio.h> +#include <stdint.h> +#include <ptscotch.h> +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 8) + return 0; + else + return 1; +} +") +check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_4}" PTSCOTCH_Num_4) +if(NOT PTSCOTCH_Num_4) + check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_8}" PTSCOTCH_Num_8) + if(NOT PTSCOTCH_Num_8) + set(PTSCOTCH_INTSIZE -1) + else() + set(PTSCOTCH_INTSIZE 8) + endif() +else() + set(PTSCOTCH_INTSIZE 4) +endif() +set(CMAKE_REQUIRED_INCLUDES "") + +# check that PTSCOTCH has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PTSCOTCH DEFAULT_MSG + PTSCOTCH_LIBRARIES + PTSCOTCH_WORKS) +# +# TODO: Add possibility to check for specific functions in the library +# diff --git a/externals/eigen/cmake/FindSCOTCH.cmake b/externals/eigen/cmake/FindSCOTCH.cmake new file mode 100644 index 00000000..11b971a9 --- /dev/null +++ b/externals/eigen/cmake/FindSCOTCH.cmake @@ -0,0 +1,370 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+# +### +# +# - Find SCOTCH include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(SCOTCH +# [REQUIRED] # Fail with error if scotch is not found +# [COMPONENTS ...] # dependencies +# ) +# +# COMPONENTS can be some of the following: +# - ESMUMPS: to activate detection of Scotch with the esmumps interface +# +# This module finds headers and scotch library. +# Results are reported in variables: +# SCOTCH_FOUND - True if headers and requested libraries were found +# SCOTCH_INCLUDE_DIRS - scotch include directories +# SCOTCH_LIBRARY_DIRS - Link directories for scotch libraries +# SCOTCH_LIBRARIES - scotch component libraries to be linked +# SCOTCH_INTSIZE - Number of octets occupied by a SCOTCH_Num +# +# The user can give specific paths where to find the libraries by adding cmake +# options at configure (ex: cmake path/to/project -DSCOTCH_DIR=path/to/scotch): +# SCOTCH_DIR - Where to find the base directory of scotch +# SCOTCH_INCDIR - Where to find the header files +# SCOTCH_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variables: SCOTCH_DIR, SCOTCH_INCDIR, SCOTCH_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.)
+ +if (NOT SCOTCH_FOUND) + set(SCOTCH_DIR "" CACHE PATH "Installation directory of SCOTCH library") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "A cache variable, namely SCOTCH_DIR, has been set to specify the install directory of SCOTCH") + endif() +endif() + +# Set the version to find +set(SCOTCH_LOOK_FOR_ESMUMPS OFF) + +if( SCOTCH_FIND_COMPONENTS ) + foreach( component ${SCOTCH_FIND_COMPONENTS} ) + if (${component} STREQUAL "ESMUMPS") + # means we look for esmumps library + set(SCOTCH_LOOK_FOR_ESMUMPS ON) + endif() + endforeach() +endif() + +# SCOTCH may depend on Threads, try to find it +include(CMakeFindDependencyMacro) +if (NOT THREADS_FOUND) + if (SCOTCH_FIND_REQUIRED) + find_dependency(Threads REQUIRED) + else() + find_dependency(Threads) + endif() +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_SCOTCH_DIR "$ENV{SCOTCH_DIR}") +set(ENV_SCOTCH_INCDIR "$ENV{SCOTCH_INCDIR}") +if(ENV_SCOTCH_INCDIR) + list(APPEND _inc_env "${ENV_SCOTCH_INCDIR}") +elseif(ENV_SCOTCH_DIR) + list(APPEND _inc_env "${ENV_SCOTCH_DIR}") + list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include") + list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include/scotch") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the scotch header in the given paths +# 
------------------------------------------------- +# call cmake macro to find the header path +if(SCOTCH_INCDIR) + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${SCOTCH_INCDIR}) +else() + if(SCOTCH_DIR) + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${SCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") + else() + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") + endif() +endif() +mark_as_advanced(SCOTCH_scotch.h_DIRS) + +# If found, add path to cmake variable +# ------------------------------------ +if (SCOTCH_scotch.h_DIRS) + set(SCOTCH_INCLUDE_DIRS "${SCOTCH_scotch.h_DIRS}") +else () + set(SCOTCH_INCLUDE_DIRS "SCOTCH_INCLUDE_DIRS-NOTFOUND") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for scotch -- scotch.h not found") + endif() +endif() +list(REMOVE_DUPLICATES SCOTCH_INCLUDE_DIRS) + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_SCOTCH_LIBDIR "$ENV{SCOTCH_LIBDIR}") +if(ENV_SCOTCH_LIBDIR) + list(APPEND _lib_env "${ENV_SCOTCH_LIBDIR}") +elseif(ENV_SCOTCH_DIR) + list(APPEND _lib_env "${ENV_SCOTCH_DIR}") + list(APPEND _lib_env "${ENV_SCOTCH_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the scotch lib in the given paths +# ---------------------------------------------- + +set(SCOTCH_libs_to_find "scotch;scotcherrexit") 
+if (SCOTCH_LOOK_FOR_ESMUMPS) + list(INSERT SCOTCH_libs_to_find 0 "esmumps") +endif() + +# call cmake macro to find the lib path +if(SCOTCH_LIBDIR) + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${SCOTCH_LIBDIR}) + endforeach() +else() + if(SCOTCH_DIR) + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${SCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +set(SCOTCH_LIBRARIES "") +set(SCOTCH_LIBRARY_DIRS "") +# If found, add path to cmake variable +# ------------------------------------ +foreach(scotch_lib ${SCOTCH_libs_to_find}) + + if (SCOTCH_${scotch_lib}_LIBRARY) + get_filename_component(${scotch_lib}_lib_path "${SCOTCH_${scotch_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}") + list(APPEND SCOTCH_LIBRARY_DIRS "${${scotch_lib}_lib_path}") + else () + list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for scotch -- lib ${scotch_lib} not found") + endif() + endif () + + mark_as_advanced(SCOTCH_${scotch_lib}_LIBRARY) + +endforeach() +list(REMOVE_DUPLICATES SCOTCH_LIBRARY_DIRS) + +# check a function to validate the find +if(SCOTCH_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # SCOTCH + if (SCOTCH_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}") + endif() + if (SCOTCH_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${SCOTCH_LIBRARY_DIRS}") + endif() 
+ set(REQUIRED_LIBS "${SCOTCH_LIBRARIES}") + # THREADS + if(CMAKE_THREAD_LIBS_INIT) + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + endif() + set(Z_LIBRARY "Z_LIBRARY-NOTFOUND") + find_library(Z_LIBRARY NAMES z) + mark_as_advanced(Z_LIBRARY) + if(Z_LIBRARY) + list(APPEND REQUIRED_LIBS "-lz") + endif() + set(M_LIBRARY "M_LIBRARY-NOTFOUND") + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + set(RT_LIBRARY "RT_LIBRARY-NOTFOUND") + find_library(RT_LIBRARY NAMES rt) + mark_as_advanced(RT_LIBRARY) + if(RT_LIBRARY) + list(APPEND REQUIRED_LIBS "-lrt") + endif() + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(SCOTCH_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(SCOTCH_graphInit SCOTCH_WORKS) + mark_as_advanced(SCOTCH_WORKS) + + if(SCOTCH_WORKS) + # save link with dependencies + set(SCOTCH_LIBRARIES "${REQUIRED_LIBS}") + else() + if(NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for SCOTCH : test of SCOTCH_graphInit with SCOTCH library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif() + +if (SCOTCH_LIBRARIES) + list(GET SCOTCH_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if ("${first_lib_path}" MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir
"${first_lib_path}") + set(SCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of SCOTCH library" FORCE) + else() + set(SCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of SCOTCH library" FORCE) + endif() +endif() +mark_as_advanced(SCOTCH_DIR) +mark_as_advanced(SCOTCH_DIR_FOUND) + +# Check the size of SCOTCH_Num +# --------------------------------- +set(CMAKE_REQUIRED_INCLUDES ${SCOTCH_INCLUDE_DIRS}) + +include(CheckCSourceRuns) +#stdio.h and stdint.h should be included by scotch.h directly +set(SCOTCH_C_TEST_SCOTCH_Num_4 " +#include <stdio.h> +#include <stdint.h> +#include <scotch.h> +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 4) + return 0; + else + return 1; +} +") + +set(SCOTCH_C_TEST_SCOTCH_Num_8 " +#include <stdio.h> +#include <stdint.h> +#include <scotch.h> +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 8) + return 0; + else + return 1; +} +") +check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_4}" SCOTCH_Num_4) +if(NOT SCOTCH_Num_4) + check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_8}" SCOTCH_Num_8) + if(NOT SCOTCH_Num_8) + set(SCOTCH_INTSIZE -1) + else() + set(SCOTCH_INTSIZE 8) + endif() +else() + set(SCOTCH_INTSIZE 4) +endif() +set(CMAKE_REQUIRED_INCLUDES "") + +# check that SCOTCH has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SCOTCH DEFAULT_MSG + SCOTCH_LIBRARIES + SCOTCH_WORKS) +# +# TODO: Add possibility to check for specific functions in the library +# diff --git a/externals/eigen/cmake/FindSPQR.cmake b/externals/eigen/cmake/FindSPQR.cmake new file mode 100644 index 00000000..d6fb2e13 --- /dev/null +++ b/externals/eigen/cmake/FindSPQR.cmake @@ -0,0 +1,41 @@ +# SPQR lib usually requires linking to a blas and lapack library. +# It is up to the user of this module to find a BLAS and link to it. + +# SPQR lib requires Cholmod, colamd and amd as well.
+# FindCholmod.cmake can be used to find those packages before finding spqr + +if (SPQR_INCLUDES AND SPQR_LIBRARIES) + set(SPQR_FIND_QUIETLY TRUE) +endif () + +find_path(SPQR_INCLUDES + NAMES + SuiteSparseQR.hpp + PATHS + $ENV{SPQRDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +find_library(SPQR_LIBRARIES spqr $ENV{SPQRDIR} ${LIB_INSTALL_DIR}) + +if(SPQR_LIBRARIES) + + find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS $ENV{SPQRDIR} ${LIB_INSTALL_DIR}) + if (SUITESPARSE_LIBRARY) + set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${SUITESPARSE_LIBRARY}) + endif() + + find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}) + if(CHOLMOD_LIBRARY) + set(SPQR_LIBRARIES ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARY}) + endif() + +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SPQR DEFAULT_MSG SPQR_INCLUDES SPQR_LIBRARIES) + +mark_as_advanced(SPQR_INCLUDES SPQR_LIBRARIES) \ No newline at end of file diff --git a/externals/eigen/cmake/FindStandardMathLibrary.cmake b/externals/eigen/cmake/FindStandardMathLibrary.cmake new file mode 100644 index 00000000..1d1e5b3a --- /dev/null +++ b/externals/eigen/cmake/FindStandardMathLibrary.cmake @@ -0,0 +1,70 @@ +# - Try to find how to link to the standard math library, if anything at all is needed to do. +# On most platforms this is automatic, but for example it's not automatic on QNX. +# +# Once done this will define +# +# STANDARD_MATH_LIBRARY_FOUND - we found how to successfully link to the standard math library +# STANDARD_MATH_LIBRARY - the name of the standard library that one has to link to. +# -- this will be left empty if it's automatic (most platforms). +# -- this will be set to "m" on platforms where one must explicitly +# pass the "-lm" linker flag. +# +# Copyright (c) 2010 Benoit Jacob +# 2020 Susi Lehtola +# Redistribution and use is allowed according to the terms of the 2-clause BSD license. 
+ + +include(CheckCXXSourceCompiles) + +# a little test program for c++ math functions. +# notice the std:: is required on some platforms such as QNX +# notice the (void) is required if -Wall (-Wunused-value) is added to CMAKE_CXX_FLAG + +# We read in the arguments from standard input to avoid the compiler optimizing away the calls +set(find_standard_math_library_test_program +" +#include +int main(int argc, char **){ + return int(std::sin(double(argc)) + std::log(double(argc))); +}") + +# first try compiling/linking the test program without any linker flags + +set(CMAKE_REQUIRED_FLAGS "") +set(CMAKE_REQUIRED_LIBRARIES "") +CHECK_CXX_SOURCE_COMPILES( + "${find_standard_math_library_test_program}" + standard_math_library_linked_to_automatically +) + +if(standard_math_library_linked_to_automatically) + + # the test program linked successfully without any linker flag. + set(STANDARD_MATH_LIBRARY "") + set(STANDARD_MATH_LIBRARY_FOUND TRUE) + +else() + + # the test program did not link successfully without any linker flag. + # This is a very uncommon case that so far we only saw on QNX. The next try is the + # standard name 'm' for the standard math library. + + set(CMAKE_REQUIRED_LIBRARIES "m") + CHECK_CXX_SOURCE_COMPILES( + "${find_standard_math_library_test_program}" + standard_math_library_linked_to_as_m) + + if(standard_math_library_linked_to_as_m) + + # the test program linked successfully when linking to the 'm' library + set(STANDARD_MATH_LIBRARY "m") + set(STANDARD_MATH_LIBRARY_FOUND TRUE) + + else() + + # the test program still doesn't link successfully + set(STANDARD_MATH_LIBRARY_FOUND FALSE) + + endif() + +endif() diff --git a/externals/eigen/cmake/FindSuperLU.cmake b/externals/eigen/cmake/FindSuperLU.cmake new file mode 100644 index 00000000..4b779f51 --- /dev/null +++ b/externals/eigen/cmake/FindSuperLU.cmake @@ -0,0 +1,97 @@ + +# Umfpack lib usually requires linking to a blas library. +# It is up to the user of this module to find a BLAS and link to it. 
+ +if (SUPERLU_INCLUDES AND SUPERLU_LIBRARIES) + set(SUPERLU_FIND_QUIETLY TRUE) +endif () + +find_path(SUPERLU_INCLUDES + NAMES + supermatrix.h + PATHS + $ENV{SUPERLUDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + superlu + SRC +) + +find_library(SUPERLU_LIBRARIES + NAMES "superlu_5.2.1" "superlu_5.2" "superlu_5.1.1" "superlu_5.1" "superlu_5.0" "superlu_4.3" "superlu_4.2" "superlu_4.1" "superlu_4.0" "superlu_3.1" "superlu_3.0" "superlu" + PATHS $ENV{SUPERLUDIR} ${LIB_INSTALL_DIR} + PATH_SUFFIXES lib) + +if(SUPERLU_INCLUDES AND SUPERLU_LIBRARIES) + +include(CheckCXXSourceCompiles) +include(CMakePushCheckState) +cmake_push_check_state() + +set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} ${SUPERLU_INCLUDES}) + +# check whether struct mem_usage_t is globally defined +check_cxx_source_compiles(" +typedef int int_t; +#include +#include +int main() { + mem_usage_t mem; + return 0; +}" +SUPERLU_HAS_GLOBAL_MEM_USAGE_T) + + +check_cxx_source_compiles(" +typedef int int_t; +#include +#include +int main() { + return SLU_SINGLE; +}" +SUPERLU_HAS_CLEAN_ENUMS) + +check_cxx_source_compiles(" +typedef int int_t; +#include +#include +int main(void) +{ + GlobalLU_t glu; + return 0; +}" +SUPERLU_HAS_GLOBALLU_T) + +if(SUPERLU_HAS_GLOBALLU_T) + # at least 5.0 + set(SUPERLU_VERSION_VAR "5.0") +elseif(SUPERLU_HAS_CLEAN_ENUMS) + # at least 4.3 + set(SUPERLU_VERSION_VAR "4.3") +elseif(SUPERLU_HAS_GLOBAL_MEM_USAGE_T) + # at least 4.0 + set(SUPERLU_VERSION_VAR "4.0") +else() + set(SUPERLU_VERSION_VAR "3.0") +endif() + +cmake_pop_check_state() + +if(SuperLU_FIND_VERSION) + if(${SUPERLU_VERSION_VAR} VERSION_LESS ${SuperLU_FIND_VERSION}) + set(SUPERLU_VERSION_OK FALSE) + else() + set(SUPERLU_VERSION_OK TRUE) + endif() +else() + set(SUPERLU_VERSION_OK TRUE) +endif() + +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SuperLU + REQUIRED_VARS SUPERLU_INCLUDES SUPERLU_LIBRARIES SUPERLU_VERSION_OK + VERSION_VAR SUPERLU_VERSION_VAR) + 
+mark_as_advanced(SUPERLU_INCLUDES SUPERLU_LIBRARIES) diff --git a/externals/eigen/cmake/FindTriSYCL.cmake b/externals/eigen/cmake/FindTriSYCL.cmake new file mode 100644 index 00000000..81042390 --- /dev/null +++ b/externals/eigen/cmake/FindTriSYCL.cmake @@ -0,0 +1,173 @@ +#.rst: +# FindTriSYCL +#--------------- +# +# TODO : insert Copyright and licence + +######################### +# FindTriSYCL.cmake +######################### +# +# Tools for finding and building with TriSYCL. +# +# User must define TRISYCL_INCLUDE_DIR pointing to the triSYCL +# include directory. +# +# Latest version of this file can be found at: +# https://github.com/triSYCL/triSYCL + +# Requite CMake version 3.5 or higher +cmake_minimum_required (VERSION 3.5) + +# Check that a supported host compiler can be found +if(CMAKE_COMPILER_IS_GNUCXX) + # Require at least gcc 5.4 + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4) + message(FATAL_ERROR + "host compiler - Not found! (gcc version must be at least 5.4)") + else() + message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # Require at least clang 3.9 + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.9) + message(FATAL_ERROR + "host compiler - Not found! (clang version must be at least 3.9)") + else() + message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") + endif() +else() + message(WARNING + "host compiler - Not found! 
(triSYCL supports GCC and Clang)") +endif() + +#triSYCL options +option(TRISYCL_OPENMP "triSYCL multi-threading with OpenMP" ON) +option(TRISYCL_OPENCL "triSYCL OpenCL interoperability mode" OFF) +option(TRISYCL_NO_ASYNC "triSYCL use synchronous kernel execution" OFF) +option(TRISYCL_DEBUG "triSCYL use debug mode" OFF) +option(TRISYCL_DEBUG_STRUCTORS "triSYCL trace of object lifetimes" OFF) +option(TRISYCL_TRACE_KERNEL "triSYCL trace of kernel execution" OFF) + +mark_as_advanced(TRISYCL_OPENMP) +mark_as_advanced(TRISYCL_OPENCL) +mark_as_advanced(TRISYCL_NO_ASYNC) +mark_as_advanced(TRISYCL_DEBUG) +mark_as_advanced(TRISYCL_DEBUG_STRUCTORS) +mark_as_advanced(TRISYCL_TRACE_KERNEL) + +#triSYCL definitions +set(CL_SYCL_LANGUAGE_VERSION 220 CACHE STRING + "Host language version to be used by trisYCL (default is: 220)") +set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE STRING + "Device language version to be used by trisYCL (default is: 220)") +# triSYCL now requires c++17 +set(CMAKE_CXX_STANDARD 17) +set(CXX_STANDARD_REQUIRED ON) + + +# Find OpenCL package +include(CMakeFindDependencyMacro) +if(TRISYCL_OPENCL) + find_dependency(OpenCL REQUIRED) + if(UNIX) + set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH + "Path to Boost.Compute headers (default is: /usr/include/compute)") + endif() +endif() + +# Find OpenMP package +if(TRISYCL_OPENMP) + find_dependency(OpenMP REQUIRED) +endif() + +# Find Boost +find_dependency(Boost 1.58 REQUIRED COMPONENTS chrono log) + +# If debug or trace we need boost log +if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL) + set(LOG_NEEDED ON) +else() + set(LOG_NEEDED OFF) +endif() + +find_dependency(Threads REQUIRED) + +# Find triSYCL directory +if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES) + set(TRISYCL_FIND_QUIETLY TRUE) +endif () + +find_path(TRISYCL_INCLUDE_DIR + NAMES sycl.hpp + PATHS $ENV{TRISYCLDIR} $ENV{TRISYCLDIR}/include ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES triSYCL +) + +include(FindPackageHandleStandardArgs) 
+find_package_handle_standard_args(TriSYCL DEFAULT_MSG + TRISYCL_INCLUDE_DIR) + +if(NOT TRISYCL_INCLUDE_DIR) + message(FATAL_ERROR + "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR") +else() + message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}") +endif() + +include(CMakeParseArguments) +####################### +# add_sycl_to_target +####################### +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(ADD_SYCL_ARGS + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + + # Add include directories to the "#include <>" paths + target_include_directories (${ADD_SYCL_ARGS_TARGET} PUBLIC + ${TRISYCL_INCLUDE_DIR} + ${Boost_INCLUDE_DIRS} + $<$:${OpenCL_INCLUDE_DIRS}> + $<$:${BOOST_COMPUTE_INCPATH}>) + + # Link dependencies + target_link_libraries(${ADD_SYCL_ARGS_TARGET} + $<$:${OpenCL_LIBRARIES}> + Threads::Threads + $<$:Boost::log> + Boost::chrono) + + # Compile definitions + target_compile_definitions(${ADD_SYCL_ARGS_TARGET} PUBLIC + EIGEN_SYCL_TRISYCL + $<$:TRISYCL_NO_ASYNC> + $<$:TRISYCL_OPENCL> + $<$:TRISYCL_DEBUG> + $<$:TRISYCL_DEBUG_STRUCTORS> + $<$:TRISYCL_TRACE_KERNEL> + $<$:BOOST_LOG_DYN_LINK>) + + # C++ and OpenMP requirements + target_compile_options(${ADD_SYCL_ARGS_TARGET} PUBLIC + ${TRISYCL_COMPILE_OPTIONS} + $<$:${OpenMP_CXX_FLAGS}>) + + if(${TRISYCL_OPENMP} AND (NOT WIN32)) + # Does not support generator expressions + set_target_properties(${ADD_SYCL_ARGS_TARGET} + PROPERTIES + LINK_FLAGS ${OpenMP_CXX_FLAGS}) + endif() + +endfunction() diff --git a/externals/eigen/cmake/FindUMFPACK.cmake b/externals/eigen/cmake/FindUMFPACK.cmake new file mode 100644 index 00000000..91cf6372 --- /dev/null +++ b/externals/eigen/cmake/FindUMFPACK.cmake @@ -0,0 +1,53 @@ +# Umfpack lib usually requires linking to a blas library. +# It is up to the user of this module to find a BLAS and link to it. 
+ +if (UMFPACK_INCLUDES AND UMFPACK_LIBRARIES) + set(UMFPACK_FIND_QUIETLY TRUE) +endif () + +find_path(UMFPACK_INCLUDES + NAMES + umfpack.h + PATHS + $ENV{UMFPACKDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +find_library(UMFPACK_LIBRARIES umfpack PATHS $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}) + +if(UMFPACK_LIBRARIES) + + if(NOT UMFPACK_LIBDIR) + get_filename_component(UMFPACK_LIBDIR ${UMFPACK_LIBRARIES} PATH) + endif() + + find_library(COLAMD_LIBRARY colamd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}) + if(COLAMD_LIBRARY) + set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${COLAMD_LIBRARY}) + endif () + + find_library(AMD_LIBRARY amd PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}) + if(AMD_LIBRARY) + set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${AMD_LIBRARY}) + endif () + + find_library(SUITESPARSE_LIBRARY SuiteSparse PATHS ${UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}) + if(SUITESPARSE_LIBRARY) + set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${SUITESPARSE_LIBRARY}) + endif () + + find_library(CHOLMOD_LIBRARY cholmod PATHS $ENV{UMFPACK_LIBDIR} $ENV{UMFPACKDIR} ${LIB_INSTALL_DIR}) + if(CHOLMOD_LIBRARY) + set(UMFPACK_LIBRARIES ${UMFPACK_LIBRARIES} ${CHOLMOD_LIBRARY}) + endif() + +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(UMFPACK DEFAULT_MSG + UMFPACK_INCLUDES UMFPACK_LIBRARIES) + +mark_as_advanced(UMFPACK_INCLUDES UMFPACK_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY CHOLMOD_LIBRARY SUITESPARSE_LIBRARY) diff --git a/externals/eigen/cmake/RegexUtils.cmake b/externals/eigen/cmake/RegexUtils.cmake new file mode 100644 index 00000000..f0a15248 --- /dev/null +++ b/externals/eigen/cmake/RegexUtils.cmake @@ -0,0 +1,19 @@ +function(escape_string_as_regex _str_out _str_in) + string(REGEX REPLACE "\\\\" "\\\\\\\\" FILETEST2 "${_str_in}") + string(REGEX REPLACE "([.$+*?|-])" "\\\\\\1" FILETEST2 "${FILETEST2}") + string(REGEX REPLACE "\\^" "\\\\^" FILETEST2 "${FILETEST2}") + string(REGEX 
REPLACE "\\(" "\\\\(" FILETEST2 "${FILETEST2}") + string(REGEX REPLACE "\\)" "\\\\)" FILETEST2 "${FILETEST2}") + string(REGEX REPLACE "\\[" "\\\\[" FILETEST2 "${FILETEST2}") + string(REGEX REPLACE "\\]" "\\\\]" FILETEST2 "${FILETEST2}") + set(${_str_out} "${FILETEST2}" PARENT_SCOPE) +endfunction() + +function(test_escape_string_as_regex) + set(test1 "\\.^$-+*()[]?|") + escape_string_as_regex(test2 "${test1}") + set(testRef "\\\\\\.\\^\\$\\-\\+\\*\\(\\)\\[\\]\\?\\|") + if(NOT test2 STREQUAL testRef) + message("Error in the escape_string_for_regex function : \n ${test1} was escaped as ${test2}, should be ${testRef}") + endif() +endfunction() \ No newline at end of file diff --git a/externals/eigen/cmake/UseEigen3.cmake b/externals/eigen/cmake/UseEigen3.cmake new file mode 100644 index 00000000..a38bac82 --- /dev/null +++ b/externals/eigen/cmake/UseEigen3.cmake @@ -0,0 +1,6 @@ +# -*- cmake -*- +# +# UseEigen3.cmake + +add_definitions ( ${EIGEN3_DEFINITIONS} ) +include_directories ( ${EIGEN3_INCLUDE_DIRS} ) diff --git a/externals/eigen/unsupported/CMakeLists.txt b/externals/eigen/unsupported/CMakeLists.txt deleted file mode 100644 index 4fef40a8..00000000 --- a/externals/eigen/unsupported/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_subdirectory(Eigen) -add_subdirectory(doc EXCLUDE_FROM_ALL) -if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) - add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest -else() - add_subdirectory(test EXCLUDE_FROM_ALL) -endif() diff --git a/externals/eigen/unsupported/Eigen/AdolcForward b/externals/eigen/unsupported/Eigen/AdolcForward deleted file mode 100644 index 15f5f073..00000000 --- a/externals/eigen/unsupported/Eigen/AdolcForward +++ /dev/null @@ -1,156 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ADLOC_FORWARD -#define EIGEN_ADLOC_FORWARD - -//-------------------------------------------------------------------------------- -// -// This file provides support for adolc's adouble type in forward mode. -// ADOL-C is a C++ automatic differentiation library, -// see https://projects.coin-or.org/ADOL-C for more information. -// -// Note that the maximal number of directions is controlled by -// the preprocessor token NUMBER_DIRECTIONS. The default is 2. -// -//-------------------------------------------------------------------------------- - -#define ADOLC_TAPELESS -#ifndef NUMBER_DIRECTIONS -# define NUMBER_DIRECTIONS 2 -#endif -#include - -// adolc defines some very stupid macros: -#if defined(malloc) -# undef malloc -#endif - -#if defined(calloc) -# undef calloc -#endif - -#if defined(realloc) -# undef realloc -#endif - -#include - -namespace Eigen { - -/** - * \defgroup AdolcForward_Module Adolc forward module - * This module provides support for adolc's adouble type in forward mode. - * ADOL-C is a C++ automatic differentiation library, - * see https://projects.coin-or.org/ADOL-C for more information. - * It mainly consists in: - * - a struct Eigen::NumTraits specialization - * - overloads of internal::* math function for adtl::adouble type. - * - * Note that the maximal number of directions is controlled by - * the preprocessor token NUMBER_DIRECTIONS. The default is 2. 
- * - * \code - * #include - * \endcode - */ - //@{ - -} // namespace Eigen - -// Eigen's require a few additional functions which must be defined in the same namespace -// than the custom scalar type own namespace -namespace adtl { - -inline const adouble& conj(const adouble& x) { return x; } -inline const adouble& real(const adouble& x) { return x; } -inline adouble imag(const adouble&) { return 0.; } -inline adouble abs(const adouble& x) { return fabs(x); } -inline adouble abs2(const adouble& x) { return x*x; } - -} - -namespace Eigen { - -template<> struct NumTraits - : NumTraits -{ - typedef adtl::adouble Real; - typedef adtl::adouble NonInteger; - typedef adtl::adouble Nested; - enum { - IsComplex = 0, - IsInteger = 0, - IsSigned = 1, - RequireInitialization = 1, - ReadCost = 1, - AddCost = 1, - MulCost = 1 - }; -}; - -template class AdolcForwardJacobian : public Functor -{ - typedef adtl::adouble ActiveScalar; -public: - - AdolcForwardJacobian() : Functor() {} - AdolcForwardJacobian(const Functor& f) : Functor(f) {} - - // forward constructors - template - AdolcForwardJacobian(const T0& a0) : Functor(a0) {} - template - AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} - template - AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {} - - typedef typename Functor::InputType InputType; - typedef typename Functor::ValueType ValueType; - typedef typename Functor::JacobianType JacobianType; - - typedef Matrix ActiveInput; - typedef Matrix ActiveValue; - - void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const - { - eigen_assert(v!=0); - if (!_jac) - { - Functor::operator()(x, v); - return; - } - - JacobianType& jac = *_jac; - - ActiveInput ax = x.template cast(); - ActiveValue av(jac.rows()); - - for (int j=0; j -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ALIGNED_VECTOR3 -#define EIGEN_ALIGNED_VECTOR3 - -#include - -namespace Eigen { - -/** - * \defgroup AlignedVector3_Module Aligned vector3 module - * - * \code - * #include - * \endcode - */ - //@{ - - -/** \class AlignedVector3 - * - * \brief A vectorization friendly 3D vector - * - * This class represents a 3D vector internally using a 4D vector - * such that vectorization can be seamlessly enabled. Of course, - * the same result can be achieved by directly using a 4D vector. - * This class makes this process simpler. - * - */ -// TODO specialize Cwise -template class AlignedVector3; - -namespace internal { -template struct traits > - : traits > -{ -}; -} - -template class AlignedVector3 - : public MatrixBase > -{ - typedef Matrix<_Scalar,4,1> CoeffType; - CoeffType m_coeffs; - public: - - typedef MatrixBase > Base; - EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3) - using Base::operator*; - - inline Index rows() const { return 3; } - inline Index cols() const { return 1; } - - Scalar* data() { return m_coeffs.data(); } - const Scalar* data() const { return m_coeffs.data(); } - Index innerStride() const { return 1; } - Index outerStride() const { return 3; } - - inline const Scalar& coeff(Index row, Index col) const - { return m_coeffs.coeff(row, col); } - - inline Scalar& coeffRef(Index row, Index col) - { return m_coeffs.coeffRef(row, col); } - - inline const Scalar& coeff(Index index) const - { return m_coeffs.coeff(index); } - - inline Scalar& coeffRef(Index index) - { return m_coeffs.coeffRef(index);} - - - inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) - : m_coeffs(x, y, z, Scalar(0)) - {} - - inline AlignedVector3(const AlignedVector3& other) - : Base(), m_coeffs(other.m_coeffs) - {} - - template - struct generic_assign_selector {}; - - template struct generic_assign_selector - { - inline static void 
run(AlignedVector3& dest, const XprType& src) - { - dest.m_coeffs = src; - } - }; - - template struct generic_assign_selector - { - inline static void run(AlignedVector3& dest, const XprType& src) - { - dest.m_coeffs.template head<3>() = src; - dest.m_coeffs.w() = Scalar(0); - } - }; - - template - inline AlignedVector3(const MatrixBase& other) - { - generic_assign_selector::run(*this,other.derived()); - } - - inline AlignedVector3& operator=(const AlignedVector3& other) - { m_coeffs = other.m_coeffs; return *this; } - - template - inline AlignedVector3& operator=(const MatrixBase& other) - { - generic_assign_selector::run(*this,other.derived()); - return *this; - } - - inline AlignedVector3 operator+(const AlignedVector3& other) const - { return AlignedVector3(m_coeffs + other.m_coeffs); } - - inline AlignedVector3& operator+=(const AlignedVector3& other) - { m_coeffs += other.m_coeffs; return *this; } - - inline AlignedVector3 operator-(const AlignedVector3& other) const - { return AlignedVector3(m_coeffs - other.m_coeffs); } - - inline AlignedVector3 operator-=(const AlignedVector3& other) - { m_coeffs -= other.m_coeffs; return *this; } - - inline AlignedVector3 operator*(const Scalar& s) const - { return AlignedVector3(m_coeffs * s); } - - inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec) - { return AlignedVector3(s * vec.m_coeffs); } - - inline AlignedVector3& operator*=(const Scalar& s) - { m_coeffs *= s; return *this; } - - inline AlignedVector3 operator/(const Scalar& s) const - { return AlignedVector3(m_coeffs / s); } - - inline AlignedVector3& operator/=(const Scalar& s) - { m_coeffs /= s; return *this; } - - inline Scalar dot(const AlignedVector3& other) const - { - eigen_assert(m_coeffs.w()==Scalar(0)); - eigen_assert(other.m_coeffs.w()==Scalar(0)); - return m_coeffs.dot(other.m_coeffs); - } - - inline void normalize() - { - m_coeffs /= norm(); - } - - inline AlignedVector3 normalized() const - { - return 
AlignedVector3(m_coeffs / norm()); - } - - inline Scalar sum() const - { - eigen_assert(m_coeffs.w()==Scalar(0)); - return m_coeffs.sum(); - } - - inline Scalar squaredNorm() const - { - eigen_assert(m_coeffs.w()==Scalar(0)); - return m_coeffs.squaredNorm(); - } - - inline Scalar norm() const - { - using std::sqrt; - return sqrt(squaredNorm()); - } - - inline AlignedVector3 cross(const AlignedVector3& other) const - { - return AlignedVector3(m_coeffs.cross3(other.m_coeffs)); - } - - template - inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const - { - return m_coeffs.template head<3>().isApprox(other,eps); - } - - CoeffType& coeffs() { return m_coeffs; } - const CoeffType& coeffs() const { return m_coeffs; } -}; - -namespace internal { - -template -struct eval, Dense> -{ - typedef const AlignedVector3<_Scalar>& type; -}; - -template -struct evaluator > - : evaluator > -{ - typedef AlignedVector3 XprType; - typedef evaluator > Base; - - evaluator(const XprType &m) : Base(m.coeffs()) {} -}; - -} - -//@} - -} - -#endif // EIGEN_ALIGNED_VECTOR3 diff --git a/externals/eigen/unsupported/Eigen/ArpackSupport b/externals/eigen/unsupported/Eigen/ArpackSupport deleted file mode 100644 index 37a2799e..00000000 --- a/externals/eigen/unsupported/Eigen/ArpackSupport +++ /dev/null @@ -1,31 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ARPACKSUPPORT_MODULE_H -#define EIGEN_ARPACKSUPPORT_MODULE_H - -#include - -#include - -/** \defgroup ArpackSupport_Module Arpack support module - * - * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition. 
- * - * \code - * #include - * \endcode - */ - -#include -#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" - -#include - -#endif // EIGEN_ARPACKSUPPORT_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/externals/eigen/unsupported/Eigen/AutoDiff b/externals/eigen/unsupported/Eigen/AutoDiff deleted file mode 100644 index abf5b7d6..00000000 --- a/externals/eigen/unsupported/Eigen/AutoDiff +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_AUTODIFF_MODULE -#define EIGEN_AUTODIFF_MODULE - -namespace Eigen { - -/** - * \defgroup AutoDiff_Module Auto Diff module - * - * This module features forward automatic differentation via a simple - * templated scalar type wrapper AutoDiffScalar. - * - * Warning : this should NOT be confused with numerical differentiation, which - * is a different method and has its own module in Eigen : \ref NumericalDiff_Module. - * - * \code - * #include - * \endcode - */ -//@{ - -} - -#include "src/AutoDiff/AutoDiffScalar.h" -// #include "src/AutoDiff/AutoDiffVector.h" -#include "src/AutoDiff/AutoDiffJacobian.h" - -namespace Eigen { -//@} -} - -#endif // EIGEN_AUTODIFF_MODULE diff --git a/externals/eigen/unsupported/Eigen/BVH b/externals/eigen/unsupported/Eigen/BVH deleted file mode 100644 index 0161a540..00000000 --- a/externals/eigen/unsupported/Eigen/BVH +++ /dev/null @@ -1,95 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2009 Ilya Baran -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_BVH_MODULE_H -#define EIGEN_BVH_MODULE_H - -#include -#include -#include -#include -#include - -namespace Eigen { - -/** - * \defgroup BVH_Module BVH module - * \brief This module provides generic bounding volume hierarchy algorithms - * and reference tree implementations. - * - * - * \code - * #include - * \endcode - * - * A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation - * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization - * of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of - * two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot - * intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function - * over a volume is no greater than the minimum of a function over any object contained in it. 
- * - * Some sample queries that can be written in terms of intersection are: - * - Determine all points where a ray intersects a triangle mesh - * - Given a set of points, determine which are contained in a query sphere - * - Given a set of spheres, determine which contain the query point - * - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$ - * in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction) - * - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set - * of points with itself) - * - * Some sample queries that can be written in terms of function minimization over a set of objects are: - * - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray) - * - Given a polyline and a query point, determine the closest point on the polyline to the query - * - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function) - * - Determine how far two meshes are from colliding (this is also a cartesian product query) - * - * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and - * from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism - * for traversal. To abstract from the query, the query is responsible for keeping track of results. 
- * - * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code - typedef Volume //the type of bounding volume - typedef Object //the type of object in the hierarchy - typedef Index //a reference to a node in the hierarchy--typically an int or a pointer - typedef VolumeIterator //an iterator type over node children--returns Index - typedef ObjectIterator //an iterator over object (leaf) children--returns const Object & - Index getRootIndex() const //returns the index of the hierarchy root - const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index - void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd, - ObjectIterator &outOBegin, ObjectIterator &outOEnd) const - //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children - //and [outOBegin, outOEnd) range over its object children - \endcode - * - * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector. - * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions: - * \code - bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume - bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately - \endcode - * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume - * intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the - * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate. - * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation. 
- * - * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair: - * \include BVH_Example.cpp - * Output: \verbinclude BVH_Example.out - */ -} - -//@{ - -#include "src/BVH/BVAlgorithms.h" -#include "src/BVH/KdBVH.h" - -//@} - -#endif // EIGEN_BVH_MODULE_H diff --git a/externals/eigen/unsupported/Eigen/CMakeLists.txt b/externals/eigen/unsupported/Eigen/CMakeLists.txt deleted file mode 100644 index 631a0601..00000000 --- a/externals/eigen/unsupported/Eigen/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -set(Eigen_HEADERS - AdolcForward - AlignedVector3 - ArpackSupport - AutoDiff - BVH - EulerAngles - FFT - IterativeSolvers - KroneckerProduct - LevenbergMarquardt - MatrixFunctions - MoreVectorization - MPRealSupport - NonLinearOptimization - NumericalDiff - OpenGLSupport - Polynomials - Skyline - SparseExtra - SpecialFunctions - Splines - ) - -install(FILES - ${Eigen_HEADERS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") - -add_subdirectory(CXX11) diff --git a/externals/eigen/unsupported/Eigen/CXX11/CMakeLists.txt b/externals/eigen/unsupported/Eigen/CXX11/CMakeLists.txt deleted file mode 100644 index 385ed240..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) - -install(FILES - ${Eigen_CXX11_HEADERS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/externals/eigen/unsupported/Eigen/CXX11/Tensor b/externals/eigen/unsupported/Eigen/CXX11/Tensor deleted file mode 100644 index 7ecb4c74..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/Tensor +++ /dev/null @@ -1,152 
+0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -//#ifndef EIGEN_CXX11_TENSOR_MODULE -//#define EIGEN_CXX11_TENSOR_MODULE - -#include "../../../Eigen/Core" - -#ifdef EIGEN_USE_SYCL -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include -#include -#include -#include -#endif - -#include - -#include "../SpecialFunctions" -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -/** \defgroup CXX11_Tensor_Module Tensor Module - * - * This module provides a Tensor class for storing arbitrarily indexed - * objects. - * - * \code - * #include - * \endcode - */ - -#include -#include -#include - -#ifdef _WIN32 -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#include -#endif - -#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 -#include -#endif - -#ifdef _WIN32 -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - -#ifdef EIGEN_USE_THREADS -#include "ThreadPool" -#endif - -#ifdef EIGEN_USE_GPU -#include -#include -#if __cplusplus >= 201103L -#include -#include -#endif -#endif - -#include "src/Tensor/TensorMacros.h" -#include "src/Tensor/TensorForwardDeclarations.h" -#include "src/Tensor/TensorMeta.h" -#include "src/Tensor/TensorFunctors.h" -#include "src/Tensor/TensorCostModel.h" -#include "src/Tensor/TensorDeviceDefault.h" -#include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" -#include "src/Tensor/TensorDeviceSycl.h" -#include "src/Tensor/TensorIndexList.h" -#include 
"src/Tensor/TensorDimensionList.h" -#include "src/Tensor/TensorDimensions.h" -#include "src/Tensor/TensorInitializer.h" -#include "src/Tensor/TensorTraits.h" -#include "src/Tensor/TensorRandom.h" -#include "src/Tensor/TensorUInt128.h" -#include "src/Tensor/TensorIntDiv.h" -#include "src/Tensor/TensorGlobalFunctions.h" - -#include "src/Tensor/TensorBase.h" - -#include "src/Tensor/TensorEvaluator.h" -#include "src/Tensor/TensorExpr.h" -#include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" -#include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorConcatenation.h" -#include "src/Tensor/TensorContractionMapper.h" -#include "src/Tensor/TensorContractionBlocking.h" -#include "src/Tensor/TensorContraction.h" -#include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" -#include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" -#include "src/Tensor/TensorFFT.h" -#include "src/Tensor/TensorPatch.h" -#include "src/Tensor/TensorImagePatch.h" -#include "src/Tensor/TensorVolumePatch.h" -#include "src/Tensor/TensorBroadcasting.h" -#include "src/Tensor/TensorChipping.h" -#include "src/Tensor/TensorInflation.h" -#include "src/Tensor/TensorLayoutSwap.h" -#include "src/Tensor/TensorMorphing.h" -#include "src/Tensor/TensorPadding.h" -#include "src/Tensor/TensorReverse.h" -#include "src/Tensor/TensorShuffling.h" -#include "src/Tensor/TensorStriding.h" -#include "src/Tensor/TensorCustomOp.h" -#include "src/Tensor/TensorEvalTo.h" -#include "src/Tensor/TensorForcedEval.h" -#include "src/Tensor/TensorGenerator.h" -#include "src/Tensor/TensorAssign.h" -#include "src/Tensor/TensorScan.h" - -#include "src/Tensor/TensorSycl.h" -#include "src/Tensor/TensorExecutor.h" -#include "src/Tensor/TensorDevice.h" - -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" -#include "src/Tensor/TensorFixedSize.h" -#include "src/Tensor/TensorMap.h" -#include "src/Tensor/TensorRef.h" - 
-#include "src/Tensor/TensorIO.h" - -#include - -//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/externals/eigen/unsupported/Eigen/CXX11/TensorSymmetry b/externals/eigen/unsupported/Eigen/CXX11/TensorSymmetry deleted file mode 100644 index fb1b0c0f..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/TensorSymmetry +++ /dev/null @@ -1,42 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE -#define EIGEN_CXX11_TENSORSYMMETRY_MODULE - -#include - -#include - -#include "src/util/CXX11Meta.h" - -/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module - * - * This module provides a classes that allow for the definition of - * symmetries w.r.t. tensor indices. - * - * Including this module will implicitly include the Tensor module. - * - * \code - * #include - * \endcode - */ - -#include "src/TensorSymmetry/util/TemplateGroupTheory.h" -#include "src/TensorSymmetry/Symmetry.h" -#include "src/TensorSymmetry/StaticSymmetry.h" -#include "src/TensorSymmetry/DynamicSymmetry.h" - -#include - -#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/externals/eigen/unsupported/Eigen/CXX11/ThreadPool b/externals/eigen/unsupported/Eigen/CXX11/ThreadPool deleted file mode 100644 index 09d637e9..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/ThreadPool +++ /dev/null @@ -1,65 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_MODULE -#define EIGEN_CXX11_THREADPOOL_MODULE - -#include "../../../Eigen/Core" - -#include - -/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module - * - * This module provides 2 threadpool implementations - * - a simple reference implementation - * - a faster non blocking implementation - * - * This module requires C++11. - * - * \code - * #include - * \endcode - */ - - -// The code depends on CXX11, so only include the module if the -// compiler supports it. -#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -#include "src/ThreadPool/ThreadLocal.h" -#include "src/ThreadPool/ThreadYield.h" -#include "src/ThreadPool/EventCount.h" -#include "src/ThreadPool/RunQueue.h" -#include "src/ThreadPool/ThreadPoolInterface.h" -#include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/SimpleThreadPool.h" -#include "src/ThreadPool/NonBlockingThreadPool.h" - -#endif - -#include - -#endif // EIGEN_CXX11_THREADPOOL_MODULE - diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md deleted file mode 100644 index 02146527..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +++ /dev/null @@ -1,1757 +0,0 @@ -# Eigen Tensors - -Tensors are multidimensional arrays of elements. Elements are typically scalars, -but more complex types such as strings are also supported. - -[TOC] - -## Tensor Classes - -You can manipulate a tensor with one of the following classes. They all are in -the namespace ```::Eigen.``` - - -### Class Tensor - -This is the class to use to create a tensor and allocate memory for it. 
The -class is templatized with the tensor datatype, such as float or int, and the -tensor rank. The rank is the number of dimensions, for example rank 2 is a -matrix. - -Tensors of this class are resizable. For example, if you assign a tensor of a -different size to a Tensor, that tensor is resized to match its new value. - -#### Constructor Tensor(size0, size1, ...) - -Constructor for a Tensor. The constructor must be passed ```rank``` integers -indicating the sizes of the instance along each of the the ```rank``` -dimensions. - - // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns - // memory to hold 24 floating point values (24 = 2 x 3 x 4). - Tensor t_3d(2, 3, 4); - - // Resize t_3d by assigning a tensor of different sizes, but same rank. - t_3d = Tensor(3, 4, 3); - -#### Constructor Tensor(size_array) - -Constructor where the sizes for the constructor are specified as an array of -values instead of an explicitly list of parameters. The array type to use is -```Eigen::array```. The array can be constructed automatically -from an initializer list. - - // Create a tensor of strings of rank 2 with sizes 5, 7. - Tensor t_2d({5, 7}); - - -### Class TensorFixedSize> - -Class to use for tensors of fixed size, where the size is known at compile -time. Fixed sized tensors can provide very fast computations because all their -dimensions are known by the compiler. FixedSize tensors are not resizable. - -If the total number of elements in a fixed size tensor is small enough the -tensor data is held onto the stack and does not cause heap allocation and free. - - // Create a 4 x 3 tensor of floats. - TensorFixedSize> t_4x3; - -### Class TensorMap> - -This is the class to use to create a tensor on top of memory allocated and -owned by another part of your code. It allows to view any piece of allocated -memory as a Tensor. Instances of this class do not own the memory where the -data are stored. 
- -A TensorMap is not resizable because it does not own the memory where its data -are stored. - -#### Constructor TensorMap>(data, size0, size1, ...) - -Constructor for a Tensor. The constructor must be passed a pointer to the -storage for the data, and "rank" size attributes. The storage has to be -large enough to hold all the data. - - // Map a tensor of ints on top of stack-allocated storage. - int storage[128]; // 2 x 4 x 2 x 8 = 128 - TensorMap t_4d(storage, 2, 4, 2, 8); - - // The same storage can be viewed as a different tensor. - // You can also pass the sizes as an array. - TensorMap t_2d(storage, 16, 8); - - // You can also map fixed-size tensors. Here we get a 1d view of - // the 2d fixed-size tensor. - Tensor> t_4x3; - TensorMap t_12(t_4x3, 12); - - -#### Class TensorRef - -See Assigning to a TensorRef below. - -## Accessing Tensor Elements - -#### tensor(index0, index1...) - -Return the element at position ```(index0, index1...)``` in tensor -```tensor```. You must pass as many parameters as the rank of ```tensor```. -The expression can be used as an l-value to set the value of the element at the -specified position. The value returned is of the datatype of the tensor. - - // Set the value of the element at position (0, 1, 0); - Tensor t_3d(2, 3, 4); - t_3d(0, 1, 0) = 12.0f; - - // Initialize all elements to random values. - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 4; ++k) { - t_3d(i, j, k) = ...some random value...; - } - } - } - - // Print elements of a tensor. - for (int i = 0; i < 2; ++i) { - LOG(INFO) << t_3d(i, 0, 0); - } - - -## TensorLayout - -The tensor library supports 2 layouts: ```ColMajor``` (the default) and -```RowMajor```. Only the default column major layout is currently fully -supported, and it is therefore not recommended to attempt to use the row major -layout at the moment. - -The layout of a tensor is optionally specified as part of its type. 
If not -specified explicitly column major is assumed. - - Tensor col_major; // equivalent to Tensor - TensorMap > row_major(data, ...); - -All the arguments to an expression must use the same layout. Attempting to mix -different layouts will result in a compilation error. - -It is possible to change the layout of a tensor or an expression using the -```swap_layout()``` method. Note that this will also reverse the order of the -dimensions. - - Tensor col_major(2, 4); - Tensor row_major(2, 4); - - Tensor col_major_result = col_major; // ok, layouts match - Tensor col_major_result = row_major; // will not compile - - // Simple layout swap - col_major_result = row_major.swap_layout(); - eigen_assert(col_major_result.dimension(0) == 4); - eigen_assert(col_major_result.dimension(1) == 2); - - // Swap the layout and preserve the order of the dimensions - array shuffle(1, 0); - col_major_result = row_major.swap_layout().shuffle(shuffle); - eigen_assert(col_major_result.dimension(0) == 2); - eigen_assert(col_major_result.dimension(1) == 4); - - -## Tensor Operations - -The Eigen Tensor library provides a vast library of operations on Tensors: -numerical operations such as addition and multiplication, geometry operations -such as slicing and shuffling, etc. These operations are available as methods -of the Tensor classes, and in some cases as operator overloads. For example -the following code computes the elementwise addition of two tensors: - - Tensor t1(2, 3, 4); - ...set some values in t1... - Tensor t2(2, 3, 4); - ...set some values in t2... - // Set t3 to the element wise sum of t1 and t2 - Tensor t3 = t1 + t2; - -While the code above looks easy enough, it is important to understand that the -expression ```t1 + t2``` is not actually adding the values of the tensors. The -expression instead constructs a "tensor operator" object of the class -TensorCwiseBinaryOp, which has references to the tensors -```t1``` and ```t2```. 
This is a small C++ object that knows how to add -```t1``` and ```t2```. It is only when the value of the expression is assigned -to the tensor ```t3``` that the addition is actually performed. Technically, -this happens through the overloading of ```operator=()``` in the Tensor class. - -This mechanism for computing tensor expressions allows for lazy evaluation and -optimizations which are what make the tensor library very fast. - -Of course, the tensor operators do nest, and the expression ```t1 + t2 * -0.3f``` is actually represented with the (approximate) tree of operators: - - TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) - - -### Tensor Operations and C++ "auto" - -Because Tensor operations create tensor operators, the C++ ```auto``` keyword -does not have its intuitive meaning. Consider these 2 lines of code: - - Tensor t3 = t1 + t2; - auto t4 = t1 + t2; - -In the first line we allocate the tensor ```t3``` and it will contain the -result of the addition of ```t1``` and ```t2```. In the second line, ```t4``` -is actually the tree of tensor operators that will compute the addition of -```t1``` and ```t2```. In fact, ```t4``` is *not* a tensor and you cannot get -the values of its elements: - - Tensor t3 = t1 + t2; - cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) - - auto t4 = t1 + t2; - cout << t4(0, 0, 0); // Compilation error! - -When you use ```auto``` you do not get a Tensor as a result but instead a -non-evaluated expression. So only use ```auto``` to delay evaluation. - -Unfortunately, there is no single underlying concrete type for holding -non-evaluated expressions, hence you have to use auto in the case when you do -want to hold non-evaluated expressions. - -When you need the results of set of tensor computations you have to assign the -result to a Tensor that will be capable of holding onto them. This can be -either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing -piece of memory. 
All the following will work: - - auto t4 = t1 + t2; - - Tensor result = t4; // Could also be: result(t4); - cout << result(0, 0, 0); - - TensorMap result(, , ...) = t4; - cout << result(0, 0, 0); - - TensorFixedSize> result = t4; - cout << result(0, 0, 0); - -Until you need the results, you can keep the operation around, and even reuse -it for additional operations. As long as you keep the expression as an -operation, no computation is performed. - - // One way to compute exp((t1 + t2) * 0.2f); - auto t3 = t1 + t2; - auto t4 = t3 * 0.2f; - auto t5 = t4.exp(); - Tensor result = t5; - - // Another way, exactly as efficient as the previous one: - Tensor result = ((t1 + t2) * 0.2f).exp(); - -### Controlling When Expression are Evaluated - -There are several ways to control when expressions are evaluated: - -* Assignment to a Tensor, TensorFixedSize, or TensorMap. -* Use of the eval() method. -* Assignment to a TensorRef. - -#### Assigning to a Tensor, TensorFixedSize, or TensorMap. - -The most common way to evaluate an expression is to assign it to a Tensor. In -the example below, the ```auto``` declarations make the intermediate values -"Operations", not Tensors, and do not cause the expressions to be evaluated. -The assignment to the Tensor ```result``` causes the evaluation of all the -operations. - - auto t3 = t1 + t2; // t3 is an Operation. - auto t4 = t3 * 0.2f; // t4 is an Operation. - auto t5 = t4.exp(); // t5 is an Operation. - Tensor result = t5; // The operations are evaluated. - -If you know the ranks and sizes of the Operation value you can assign the -Operation to a TensorFixedSize instead of a Tensor, which is a bit more -efficient. - - // We know that the result is a 4x4x2 tensor! - TensorFixedSize result = t5; - -Simiarly, assigning an expression to a TensorMap causes its evaluation. Like -tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to -have the rank and sizes of the expression that are assigned to them. 
- -#### Calling eval(). - -When you compute large composite expressions, you sometimes want to tell Eigen -that an intermediate value in the expression tree is worth evaluating ahead of -time. This is done by inserting a call to the ```eval()``` method of the -expression Operation. - - // The previous example could have been written: - Tensor result = ((t1 + t2) * 0.2f).exp(); - - // If you want to compute (t1 + t2) once ahead of time you can write: - Tensor result = ((t1 + t2).eval() * 0.2f).exp(); - -Semantically, calling ```eval()``` is equivalent to materializing the value of -the expression in a temporary Tensor of the right size. The code above in -effect does: - - // .eval() knows the size! - TensorFixedSize tmp = t1 + t2; - Tensor result = (tmp * 0.2f).exp(); - -Note that the return value of ```eval()``` is itself an Operation, so the -following code does not do what you may think: - - // Here t3 is an evaluation Operation. t3 has not been evaluated yet. - auto t3 = (t1 + t2).eval(); - - // You can use t3 in another expression. Still no evaluation. - auto t4 = (t3 * 0.2f).exp(); - - // The value is evaluated when you assign the Operation to a Tensor, using - // an intermediate tensor to represent t3.x - Tensor result = t4; - -While in the examples above calling ```eval()``` does not make a difference in -performance, in other cases it can make a huge difference. 
In the expression -below the ```broadcast()``` expression causes the ```X.maximum()``` expression -to be evaluated many times: - - Tensor<...> X ...; - Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) - * beta).exp(); - -Inserting a call to ```eval()``` between the ```maximum()``` and -```reshape()``` calls guarantees that maximum() is only computed once and -greatly speeds-up execution: - - Tensor<...> Y = - ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) - * beta).exp(); - -In the other example below, the tensor ```Y``` is both used in the expression -and its assignment. This is an aliasing problem and if the evaluation is not -done in the right order Y will be updated incrementally during the evaluation -resulting in bogus results: - - Tensor<...> Y ...; - Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); - -Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()``` -expressions ensures that the sum is computed before any updates to ```Y``` are -done. - - Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); - -Note that an eval around the full right hand side expression is not needed -because the generated has to compute the i-th value of the right hand side -before assigning it to the left hand side. - -However, if you were assigning the expression value to a shuffle of ```Y``` -then you would need to force an eval for correctness by adding an ```eval()``` -call for the right hand side: - - Y.shuffle(...) = - (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); - - -#### Assigning to a TensorRef. - -If you need to access only a few elements from the value of an expression you -can avoid materializing the value in a full tensor by using a TensorRef. - -A TensorRef is a small wrapper class for any Eigen Operation. It provides -overloads for the ```()``` operator that let you access individual values in -the expression. 
TensorRef is convenient, because the Operation themselves do -not provide a way to access individual elements. - - // Create a TensorRef for the expression. The expression is not - // evaluated yet. - TensorRef > ref = ((t1 + t2) * 0.2f).exp(); - - // Use "ref" to access individual elements. The expression is evaluated - // on the fly. - float at_0 = ref(0, 0, 0); - cout << ref(0, 1, 0); - -Only use TensorRef when you need a subset of the values of the expression. -TensorRef only computes the values you access. However note that if you are -going to access all the values it will be much faster to materialize the -results in a Tensor first. - -In some cases, if the full Tensor result would be very large, you may save -memory by accessing it as a TensorRef. But not always. So don't count on it. - - -### Controlling How Expressions Are Evaluated - -The tensor library provides several implementations of the various operations -such as contractions and convolutions. The implementations are optimized for -different environments: single threaded on CPU, multi threaded on CPU, or on a -GPU using cuda. Additional implementations may be added later. - -You can choose which implementation to use with the ```device()``` call. If -you do not choose an implementation explicitly the default implementation that -uses a single thread on the CPU is used. - -The default implementation has been optimized for recent Intel CPUs, taking -advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the -library on ARM CPUs. Note that you need to pass compiler-dependent flags -to enable the use of SSE, AVX, and other instructions. - -For example, the following code adds two tensors using the default -single-threaded CPU implementation: - - Tensor a(30, 40); - Tensor b(30, 40); - Tensor c = a + b; - -To choose a different implementation you have to insert a ```device()``` call -before the assignment of the result. 
For technical C++ reasons this requires -that the Tensor for the result be declared on its own. This means that you -have to know the size of the result. - - Eigen::Tensor c(30, 40); - c.device(...) = a + b; - -The call to ```device()``` must be the last call on the left of the operator=. - -You must pass to the ```device()``` call an Eigen device object. There are -presently three devices you can use: DefaultDevice, ThreadPoolDevice and -GpuDevice. - - -#### Evaluating With the DefaultDevice - -This is exactly the same as not inserting a ```device()``` call. - - DefaultDevice my_device; - c.device(my_device) = a + b; - -#### Evaluating with a Thread Pool - - // Create the Eigen ThreadPoolDevice. - Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); - - // Now just use the device when evaluating expressions. - Eigen::Tensor c(30, 50); - c.device(my_device) = a.contract(b, dot_product_dims); - - -#### Evaluating On GPU - -This is presently a bit more complicated than just using a thread pool device. -You need to create a GPU device but you also need to explicitly allocate the -memory for tensors with cuda. - - -## API Reference - -### Datatypes - -In the documentation of the tensor methods and Operation we mention datatypes -that are tensor-type specific: - -#### ::Dimensions - -Acts like an array of ints. Has an ```int size``` attribute, and can be -indexed like an array to access individual values. Used to represent the -dimensions of a tensor. See ```dimensions()```. - -#### ::Index - -Acts like an ```int```. Used for indexing tensors along their dimensions. See -```operator()```, ```dimension()```, and ```size()```. - -#### ::Scalar - -Represents the datatype of individual tensor elements. For example, for a -```Tensor```, ```Scalar``` is the type ```float```. See -```setConstant()```. - -#### - -We use this pseudo type to indicate that a tensor Operation is returned by a -method. 
We indicate in the text the type and dimensions of the tensor that the -Operation returns after evaluation. - -The Operation will have to be evaluated, for example by assigning it to a -tensor, before you can access the values of the resulting tensor. You can also -access the values through a TensorRef. - - -## Built-in Tensor Methods - -These are usual C++ methods that act on tensors immediately. They are not -Operations which provide delayed evaluation of their results. Unless specified -otherwise, all the methods listed below are available on all tensor classes: -Tensor, TensorFixedSize, and TensorMap. - -## Metadata - -### int NumDimensions - -Constant value indicating the number of dimensions of a Tensor. This is also -known as the tensor "rank". - - Eigen::Tensor a(3, 4); - cout << "Dims " << a.NumDimensions; - => Dims 2 - -### Dimensions dimensions() - -Returns an array-like object representing the dimensions of the tensor. -The actual type of the dimensions() result is ::Dimensions. - - Eigen::Tensor a(3, 4); - const Eigen::Tensor::Dimensions& d = a.dimensions(); - cout << "Dim size: " << d.size << ", dim 0: " << d[0] - << ", dim 1: " << d[1]; - => Dim size: 2, dim 0: 3, dim 1: 4 - -If you use a C++11 compiler, you can use ```auto``` to simplify the code: - - const auto& d = a.dimensions(); - cout << "Dim size: " << d.size << ", dim 0: " << d[0] - << ", dim 1: " << d[1]; - => Dim size: 2, dim 0: 3, dim 1: 4 - -### Index dimension(Index n) - -Returns the n-th dimension of the tensor. The actual type of the -```dimension()``` result is ```::Index```, but you can -always use it like an int. - - Eigen::Tensor a(3, 4); - int dim1 = a.dimension(1); - cout << "Dim 1: " << dim1; - => Dim 1: 4 - -### Index size() - -Returns the total number of elements in the tensor. This is the product of all -the tensor dimensions. The actual type of the ```size()``` result is -```::Index```, but you can always use it like an int. 
- - Eigen::Tensor a(3, 4); - cout << "Size: " << a.size(); - => Size: 12 - - -### Getting Dimensions From An Operation - -A few operations provide ```dimensions()``` directly, -e.g. ```TensorReslicingOp```. Most operations defer calculating dimensions -until the operation is being evaluated. If you need access to the dimensions -of a deferred operation, you can wrap it in a TensorRef (see Assigning to a -TensorRef above), which provides ```dimensions()``` and ```dimension()``` as -above. - -TensorRef can also wrap the plain Tensor types, so this is a useful idiom in -templated contexts where the underlying object could be either a raw Tensor -or some deferred operation (e.g. a slice of a Tensor). In this case, the -template code can wrap the object in a TensorRef and reason about its -dimensionality while remaining agnostic to the underlying type. - - -## Constructors - -### Tensor - -Creates a tensor of the specified size. The number of arguments must be equal -to the rank of the tensor. The content of the tensor is not initialized. - - Eigen::Tensor a(3, 4); - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - -### TensorFixedSize - -Creates a tensor of the specified size. The number of arguments in the Size<> -template parameter determines the rank of the tensor. The content of the tensor -is not initialized. - - Eigen::TensorFixedSize> a; - cout << "Rank: " << a.rank() << endl; - => Rank: 2 - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - -### TensorMap - -Creates a tensor mapping an existing array of data. The data must not be freed -until the TensorMap is discarded, and the size of the data must be large enough -to accomodate of the coefficients of the tensor. 
- - float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - Eigen::TensorMap a(data, 3, 4); - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - cout << "a(1, 2): " << a(1, 2) << endl; - => a(1, 2): 9 - - -## Contents Initialization - -When a new Tensor or a new TensorFixedSize are created, memory is allocated to -hold all the tensor elements, but the memory is not initialized. Similarly, -when a new TensorMap is created on top of non-initialized memory the memory its -contents are not initialized. - -You can use one of the methods below to initialize the tensor memory. These -have an immediate effect on the tensor and return the tensor itself as a -result. These are not tensor Operations which delay evaluation. - -### setConstant(const Scalar& val) - -Sets all elements of the tensor to the constant value ```val```. ```Scalar``` -is the type of data stored in the tensor. You can pass any value that is -convertible to that type. - -Returns the tensor itself in case you want to chain another call. - - a.setConstant(12.3f); - cout << "Constant: " << endl << a << endl << endl; - => - Constant: - 12.3 12.3 12.3 12.3 - 12.3 12.3 12.3 12.3 - 12.3 12.3 12.3 12.3 - -Note that ```setConstant()``` can be used on any tensor where the element type -has a copy constructor and an ```operator=()```: - - Eigen::Tensor a(2, 3); - a.setConstant("yolo"); - cout << "String tensor: " << endl << a << endl << endl; - => - String tensor: - yolo yolo yolo - yolo yolo yolo - - -### setZero() - -Fills the tensor with zeros. Equivalent to ```setConstant(Scalar(0))```. -Returns the tensor itself in case you want to chain another call. - - a.setZero(); - cout << "Zeros: " << endl << a << endl << endl; - => - Zeros: - 0 0 0 0 - 0 0 0 0 - 0 0 0 0 - - -### setValues({..initializer_list}) - -Fills the tensor with explicit values specified in a std::initializer_list. 
-The type of the initializer list depends on the type and rank of the tensor. - -If the tensor has rank N, the initializer list must be nested N times. The -most deeply nested lists must contain P scalars of the Tensor type where P is -the size of the last dimension of the Tensor. - -For example, for a ```TensorFixedSize``` the initializer list must -contain 2 lists of 3 floats each. - -```setValues()``` returns the tensor itself in case you want to chain another -call. - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); - cout << "a" << endl << a << endl << endl; - => - a - 0 1 2 - 3 4 5 - -If a list is too short, the corresponding elements of the tensor will not be -changed. This is valid at each level of nesting. For example the following -code only sets the values of the first row of the tensor. - - Eigen::Tensor a(2, 3); - a.setConstant(1000); - a.setValues({{10, 20, 30}}); - cout << "a" << endl << a << endl << endl; - => - a - 10 20 30 - 1000 1000 1000 - -### setRandom() - -Fills the tensor with random values. Returns the tensor itself in case you -want to chain another call. - - a.setRandom(); - cout << "Random: " << endl << a << endl << endl; - => - Random: - 0.680375 0.59688 -0.329554 0.10794 - -0.211234 0.823295 0.536459 -0.0452059 - 0.566198 -0.604897 -0.444451 0.257742 - -You can customize ```setRandom()``` by providing your own random number -generator as a template argument: - - a.setRandom(); - -Here, ```MyRandomGenerator``` must be a struct with the following member -functions, where Scalar and Index are the same as ```::Scalar``` -and ```::Index```. - -See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example. - - // Custom number generator for use with setRandom(). - struct MyRandomGenerator { - // Default and copy constructors. Both are needed - MyRandomGenerator() { } - MyRandomGenerator(const MyRandomGenerator& ) { } - - // Return a random value to be used. 
"element_location" is the - // location of the entry to set in the tensor, it can typically - // be ignored. - Scalar operator()(Eigen::DenseIndex element_location, - Eigen::DenseIndex /*unused*/ = 0) const { - return ; - } - - // Same as above but generates several numbers at a time. - typename internal::packet_traits::type packetOp( - Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { - return ; - } - }; - -You can also use one of the 2 random number generators that are part of the -tensor library: -* UniformRandomGenerator -* NormalRandomGenerator - - -## Data Access - -The Tensor, TensorFixedSize, and TensorRef classes provide the following -accessors to access the tensor coefficients: - - const Scalar& operator()(const array& indices) - const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - Scalar& operator()(const array& indices) - Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - -The number of indices must be equal to the rank of the tensor. Moreover, these -accessors are not available on tensor expressions. In order to access the -values of a tensor expression, the expression must either be evaluated or -wrapped in a TensorRef. - - -### Scalar* data() and const Scalar* data() const - -Returns a pointer to the storage for the tensor. The pointer is const if the -tensor was const. This allows direct access to the data. The layout of the -data depends on the tensor layout: RowMajor or ColMajor. - -This access is usually only needed for special cases, for example when mixing -Eigen Tensor code with other libraries. - -Scalar is the type of data stored in the tensor. - - Eigen::Tensor a(3, 4); - float* a_data = a.data(); - a_data[0] = 123.45f; - cout << "a(0, 0): " << a(0, 0); - => a(0, 0): 123.45 - - -## Tensor Operations - -All the methods documented below return non evaluated tensor ```Operations```. -These can be chained: you can apply another Tensor Operation to the value -returned by the method. 
- -The chain of Operation is evaluated lazily, typically when it is assigned to a -tensor. See "Controlling when Expression are Evaluated" for more details about -their evaluation. - -### constant(const Scalar& val) - -Returns a tensor of the same type and dimensions as the original tensor but -where all elements have the value ```val```. - -This is useful, for example, when you want to add or subtract a constant from a -tensor, or multiply every element of a tensor by a scalar. - - Eigen::Tensor a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor b = a + a.constant(2.0f); - Eigen::Tensor c = b * b.constant(0.2f); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - cout << "c" << endl << c << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - 3 3 3 - 3 3 3 - - c - 0.6 0.6 0.6 - 0.6 0.6 0.6 - -### random() - -Returns a tensor of the same type and dimensions as the current tensor -but where all elements have random values. - -This is for example useful to add random values to an existing tensor. -The generation of random values can be customized in the same manner -as for ```setRandom()```. - - Eigen::Tensor a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor b = a + a.random(); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - 1.68038 1.5662 1.82329 - 0.788766 1.59688 0.395103 - - -## Unary Element Wise Operations - -All these operations take a single input tensor as argument and return a tensor -of the same type and dimensions as the tensor to which they are applied. The -requested operations are applied to each element independently. - -### operator-() - -Returns a tensor of the same type and dimensions as the original tensor -containing the opposite values of the original tensor. 
- - Eigen::Tensor a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor b = -a; - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - -1 -1 -1 - -1 -1 -1 - -### sqrt() - -Returns a tensor of the same type and dimensions as the original tensor -containing the square roots of the original tensor. - -### rsqrt() - -Returns a tensor of the same type and dimensions as the original tensor -containing the inverse square roots of the original tensor. - -### square() - -Returns a tensor of the same type and dimensions as the original tensor -containing the squares of the original tensor values. - -### inverse() - -Returns a tensor of the same type and dimensions as the original tensor -containing the inverse of the original tensor values. - -### exp() - -Returns a tensor of the same type and dimensions as the original tensor -containing the exponential of the original tensor. - -### log() - -Returns a tensor of the same type and dimensions as the original tensor -containing the natural logarithms of the original tensor. - -### abs() - -Returns a tensor of the same type and dimensions as the original tensor -containing the absolute values of the original tensor. - -### pow(Scalar exponent) - -Returns a tensor of the same type and dimensions as the original tensor -containing the coefficients of the original tensor to the power of the -exponent. - -The type of the exponent, Scalar, is always the same as the type of the -tensor coefficients. For example, only integer exponents can be used in -conjunction with tensors of integer values. - -You can use cast() to lift this restriction. 
For example this computes -cubic roots of an int Tensor: - - Eigen::Tensor a(2, 3); - a.setValues({{0, 1, 8}, {27, 64, 125}}); - Eigen::Tensor b = a.cast().pow(1.0 / 3.0); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 0 1 8 - 27 64 125 - - b - 0 1 2 - 3 4 5 - -### operator * (Scalar scale) - -Multiplies all the coefficients of the input tensor by the provided scale. - -### cwiseMax(Scalar threshold) -TODO - -### cwiseMin(Scalar threshold) -TODO - -### unaryExpr(const CustomUnaryOp& func) -TODO - - -## Binary Element Wise Operations - -These operations take two input tensors as arguments. The 2 input tensors should -be of the same type and dimensions. The result is a tensor of the same -dimensions as the tensors to which they are applied, and unless otherwise -specified it is also of the same type. The requested operations are applied to -each pair of elements independently. - -### operator+(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise sums of the inputs. - -### operator-(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise differences of the inputs. - -### operator*(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise products of the inputs. - -### operator/(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise quotients of the inputs. - -This operator is not supported for integer types. - -### cwiseMax(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise maximums of the inputs. 
- -### cwiseMin(const OtherDerived& other) - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise minimums of the inputs. - -### Logical operators - -The following logical operators are supported as well: - -* operator&&(const OtherDerived& other) -* operator||(const OtherDerived& other) -* operator<(const OtherDerived& other) -* operator<=(const OtherDerived& other) -* operator>(const OtherDerived& other) -* operator>=(const OtherDerived& other) -* operator==(const OtherDerived& other) -* operator!=(const OtherDerived& other) - -They all return a tensor of boolean values. - - -## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)) - -Selection is a coefficient-wise ternary operator that is the tensor equivalent -to the if-then-else operation. - - Tensor if = ...; - Tensor then = ...; - Tensor else = ...; - Tensor result = if.select(then, else); - -The 3 arguments must be of the same dimensions, which will also be the dimension -of the result. The 'if' tensor must be of type boolean, the 'then' and the -'else' tensor must be of the same type, which will also be the type of the -result. - -Each coefficient in the result is equal to the corresponding coefficient in the -'then' tensor if the corresponding value in the 'if' tensor is true. If not, the -resulting coefficient will come from the 'else' tensor. - - -## Contraction - -Tensor *contractions* are a generalization of the matrix product to the -multidimensional case. 
- - // Create 2 matrices using tensors of rank 2 - Eigen::Tensor a(2, 3); - a.setValues({{1, 2, 3}, {6, 5, 4}}); - Eigen::Tensor b(3, 2); - b.setValues({{1, 2}, {4, 5}, {5, 6}}); - - // Compute the traditional matrix product - array, 1> product_dims = { IndexPair(1, 0) }; - Eigen::Tensor AB = a.contract(b, product_dims); - - // Compute the product of the transpose of the matrices - array, 1> transposed_product_dims = { IndexPair(0, 1) }; - Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); - - -## Reduction Operations - -A *Reduction* operation returns a tensor with fewer dimensions than the -original tensor. The values in the returned tensor are computed by applying a -*reduction operator* to slices of values from the original tensor. You specify -the dimensions along which the slices are made. - -The Eigen Tensor library provides a set of predefined reduction operators such -as ```maximum()``` and ```sum()``` and lets you define additional operators by -implementing a few methods from a reductor template. - -### Reduction Dimensions - -All reduction operations take a single parameter of type -```::Dimensions``` which can always be specified as an array of -ints. These are called the "reduction dimensions." The values are the indices -of the dimensions of the input tensor over which the reduction is done. The -parameter can have at most as many elements as the rank of the input tensor; -each element must be less than the tensor rank, as it indicates one of the -dimensions to reduce. - -Each dimension of the input tensor should occur at most once in the reduction -dimensions as the implementation does not remove duplicates. - -The order of the values in the reduction dimensions does not affect the -results, but the code may execute faster if you list the dimensions in -increasing order. - -Example: Reduction along one dimension. 
- - // Create a tensor of 2 dimensions - Eigen::Tensor a(2, 3); - a.setValues({{1, 2, 3}, {6, 5, 4}}); - // Reduce it along the second dimension (1)... - Eigen::array dims({1 /* dimension to reduce */}); - // ...using the "maximum" operator. - // The result is a tensor with one dimension. The size of - // that dimension is the same as the first (non-reduced) dimension of a. - Eigen::Tensor b = a.maximum(dims); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 2 3 - 6 5 4 - - b - 3 - 6 - -Example: Reduction along two dimensions. - - Eigen::Tensor a(2, 3, 4); - a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, - {7.0f, 6.0f, 5.0f, 4.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}, - {{12.0f, 13.0f, 14.0f, 15.0f}, - {19.0f, 18.0f, 17.0f, 16.0f}, - {20.0f, 21.0f, 22.0f, 23.0f}}}); - // The tensor a has 3 dimensions. We reduce along the - // first 2, resulting in a tensor with a single dimension - // of size 4 (the last dimension of a.) - // Note that we pass the array of reduction dimensions - // directly to the maximum() call. - Eigen::Tensor b = - a.maximum(Eigen::array({0, 1})); - cout << "b" << endl << b << endl << endl; - => - b - 20 - 21 - 22 - 23 - -#### Reduction along all dimensions - -As a special case, if you pass no parameter to a reduction operation the -original tensor is reduced along *all* its dimensions. The result is a -scalar, represented as a zero-dimension tensor. - - Eigen::Tensor a(2, 3, 4); - a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, - {7.0f, 6.0f, 5.0f, 4.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}, - {{12.0f, 13.0f, 14.0f, 15.0f}, - {19.0f, 18.0f, 17.0f, 16.0f}, - {20.0f, 21.0f, 22.0f, 23.0f}}}); - // Reduce along all dimensions using the sum() operator. - Eigen::Tensor b = a.sum(); - cout << "b" << endl << b << endl << endl; - => - b - 276 - - -### sum(const Dimensions& new_dims) -### sum() - -Reduce a tensor using the sum() operator. The resulting values -are the sum of the reduced values. 
- -### mean(const Dimensions& new_dims) -### mean() - -Reduce a tensor using the mean() operator. The resulting values -are the mean of the reduced values. - -### maximum(const Dimensions& new_dims) -### maximum() - -Reduce a tensor using the maximum() operator. The resulting values are the -largest of the reduced values. - -### minimum(const Dimensions& new_dims) -### minimum() - -Reduce a tensor using the minimum() operator. The resulting values -are the smallest of the reduced values. - -### prod(const Dimensions& new_dims) -### prod() - -Reduce a tensor using the prod() operator. The resulting values -are the product of the reduced values. - -### all(const Dimensions& new_dims) -### all() -Reduce a tensor using the all() operator. Casts tensor to bool and then checks -whether all elements are true. Runs through all elements rather than -short-circuiting, so may be significantly inefficient. - -### any(const Dimensions& new_dims) -### any() -Reduce a tensor using the any() operator. Casts tensor to bool and then checks -whether any element is true. Runs through all elements rather than -short-circuiting, so may be significantly inefficient. - - -### reduce(const Dimensions& new_dims, const Reducer& reducer) - -Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` -in TensorFunctors.h for information on how to implement a reduction operator. - - -## Scan Operations - -A *Scan* operation returns a tensor with the same dimensions as the original -tensor. The operation performs an inclusive scan along the specified -axis, which means it computes a running total along the axis for a given -reduction operation. -If the reduction operation corresponds to summation, then this computes the -prefix sum of the tensor along the given axis. 
- -Example: - - // Create a tensor of 2 dimensions - Eigen::Tensor a(2, 3); - a.setValues({{1, 2, 3}, {4, 5, 6}}); - // Scan it along the second dimension (1) using summation - Eigen::Tensor b = a.cumsum(1); - // The result is a tensor with the same size as the input - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 2 3 - 4 5 6 - - b - 1 3 6 - 4 9 15 - -### cumsum(const Index& axis) - -Perform a scan by summing consecutive entries. - -### cumprod(const Index& axis) - -Perform a scan by multiplying consecutive entries. - - -## Convolutions - -### convolve(const Kernel& kernel, const Dimensions& dims) - -Returns a tensor that is the output of the convolution of the input tensor with the kernel, -along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor -which were part of the convolution will be reduced by the formula: -output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). -The dimension sizes for dimensions that were not part of the convolution will remain the same. -Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the -convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is -for the last dimension). - - // Compute convolution along the second and third dimension. - Tensor input(3, 3, 7, 11); - Tensor kernel(2, 2); - Tensor output(3, 2, 6, 11); - input.setRandom(); - kernel.setRandom(); - - Eigen::array dims({1, 2}); // Specify second and third dimension for convolution. 
- output = input.convolve(kernel, dims); - - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 6; ++k) { - for (int l = 0; l < 11; ++l) { - const float result = output(i,j,k,l); - const float expected = input(i,j+0,k+0,l) * kernel(0,0) + - input(i,j+1,k+0,l) * kernel(1,0) + - input(i,j+0,k+1,l) * kernel(0,1) + - input(i,j+1,k+1,l) * kernel(1,1); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - -## Geometrical Operations - -These operations return a Tensor with different dimensions than the original -Tensor. They can be used to access slices of tensors, see them with different -dimensions, or pad tensors with additional data. - -### reshape(const Dimensions& new_dims) - -Returns a view of the input tensor that has been reshaped to the specified -new dimensions. The argument new_dims is an array of Index values. The -rank of the resulting tensor is equal to the number of elements in new_dims. - -The product of all the sizes in the new dimension array must be equal to -the number of elements in the input tensor. - - // Increase the rank of the input tensor by introducing a new dimension - // of size 1. - Tensor input(7, 11); - array three_dims{{7, 11, 1}}; - Tensor result = input.reshape(three_dims); - - // Decrease the rank of the input tensor by merging 2 dimensions; - array one_dim{{7 * 11}}; - Tensor result = input.reshape(one_dim); - -This operation does not move any data in the input tensor, so the resulting -contents of a reshaped Tensor depend on the data layout of the original Tensor. 
- -For example this is what happens when you ```reshape()``` a 2D ColMajor tensor -to one dimension: - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array one_dim({3 * 2}); - Eigen::Tensor b = a.reshape(one_dim); - cout << "b" << endl << b << endl; - => - b - 0 - 300 - 100 - 400 - 200 - 500 - -This is what happens when the 2D Tensor is RowMajor: - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array one_dim({3 * 2}); - Eigen::Tensor b = a.reshape(one_dim); - cout << "b" << endl << b << endl; - => - b - 0 - 100 - 200 - 300 - 400 - 500 - -The reshape operation is a lvalue. In other words, it can be used on the left -side of the assignment operator. - -The previous example can be rewritten as follow: - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array two_dim({2, 3}); - Eigen::Tensor b; - b.reshape(two_dim) = a; - cout << "b" << endl << b << endl; - => - b - 0 - 300 - 100 - 400 - 200 - 500 - -Note that "b" itself was not reshaped but that instead the assignment is done to -the reshape view of b. - - -### shuffle(const Shuffle& shuffle) - -Returns a copy of the input tensor whose dimensions have been -reordered according to the specified permutation. The argument shuffle -is an array of Index values. Its size is the rank of the input -tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th -dimension of the output tensor equals to the size of the shuffle[i]-th -dimension of the input tensor. For example: - - // Shuffle all dimensions to the left by 1. - Tensor input(20, 30, 50); - // ... set some values in input. - Tensor output = input.shuffle({1, 2, 0}) - - eigen_assert(output.dimension(0) == 30); - eigen_assert(output.dimension(1) == 50); - eigen_assert(output.dimension(2) == 20); - -Indices into the output tensor are shuffled accordingly to formulate -indices into the input tensor. 
For example, one can assert in the above -code snippet that: - - eigen_assert(output(3, 7, 11) == input(11, 3, 7)); - -In general, one can assert that - - eigen_assert(output(..., indices[shuffle[i]], ...) == - input(..., indices[i], ...)) - -The shuffle operation results in a lvalue, which means that it can be assigned -to. In other words, it can be used on the left side of the assignment operator. - -Let's rewrite the previous example to take advantage of this feature: - - // Shuffle all dimensions to the left by 1. - Tensor input(20, 30, 50); - // ... set some values in input. - Tensor output(30, 50, 20); - output.shuffle({2, 0, 1}) = input; - - -### stride(const Strides& strides) - -Returns a view of the input tensor that strides (skips stride-1 -elements) along each of the dimensions. The argument strides is an -array of Index values. The dimensions of the resulting tensor are -ceil(input_dimensions[i] / strides[i]). - -For example this is what happens when you ```stride()``` a 2D tensor: - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array strides({3, 2}); - Eigen::Tensor b = a.stride(strides); - cout << "b" << endl << b << endl; - => - b - 0 200 - 900 1100 - -It is possible to assign a tensor to a stride: - Tensor input(20, 30, 50); - // ... set some values in input. - Tensor output(40, 90, 200); - output.stride({2, 3, 4}) = input; - - -### slice(const StartIndices& offsets, const Sizes& extents) - -Returns a sub-tensor of the given tensor. For each dimension i, the slice is -made of the coefficients stored between offset[i] and offset[i] + extents[i] in -the input tensor. 
- - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array offsets = {1, 0}; - Eigen::array extents = {2, 2}; - Eigen::Tensor slice = a.slice(offsets, extents); - cout << "a" << endl << a << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - cout << "slice" << endl << slice << endl; - => - slice - 300 400 - 600 700 - - -### chip(const Index offset, const Index dim) - -A chip is a special kind of slice. It is the subtensor at the given offset in -the dimension dim. The returned tensor has one fewer dimension than the input -tensor: the dimension dim is removed. - -For example, a matrix chip would be either a row or a column of the input -matrix. - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::Tensor row_3 = a.chip(2, 0); - Eigen::Tensor col_2 = a.chip(1, 1); - cout << "a" << endl << a << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - cout << "row_3" << endl << row_3 << endl; - => - row_3 - 600 700 800 - cout << "col_2" << endl << col_2 << endl; - => - col_2 - 100 400 700 1000 - -It is possible to assign values to a tensor chip since the chip operation is a -lvalue. For example: - - Eigen::Tensor a(3); - a.setValues({{100, 200, 300}}); - Eigen::Tensor b(2, 3); - b.setZero(); - b.chip(0, 0) = a; - cout << "a" << endl << a << endl; - => - a - 100 - 200 - 300 - cout << "b" << endl << b << endl; - => - b - 100 200 300 - 0 0 0 - - -### reverse(const ReverseDimensions& reverse) - -Returns a view of the input tensor that reverses the order of the coefficients -along a subset of the dimensions. The argument reverse is an array of boolean -values that indicates whether or not the order of the coefficients should be -reversed along each of the dimensions. This operation preserves the dimensions -of the input tensor. 
- -For example this is what happens when you ```reverse()``` the first dimension -of a 2D tensor: - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array reverse({true, false}); - Eigen::Tensor b = a.reverse(reverse); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - b - 900 1000 1100 - 600 700 800 - 300 400 500 - 0 100 200 - - -### broadcast(const Broadcast& broadcast) - -Returns a view of the input tensor in which the input is replicated one to many -times. -The broadcast argument specifies how many copies of the input tensor need to be -made in each of the dimensions. - - Eigen::Tensor a(2, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}}); - Eigen::array bcast({3, 2}); - Eigen::Tensor b = a.broadcast(bcast); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - b - 0 100 200 0 100 200 - 300 400 500 300 400 500 - 0 100 200 0 100 200 - 300 400 500 300 400 500 - 0 100 200 0 100 200 - 300 400 500 300 400 500 - -### concatenate(const OtherDerived& other, Axis axis) - -TODO - -### pad(const PaddingDimensions& padding) - -Returns a view of the input tensor in which the input is padded with zeros. - - Eigen::Tensor a(2, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}}); - Eigen::array, 2> paddings; - paddings[0] = make_pair(0, 1); - paddings[1] = make_pair(2, 3); - Eigen::Tensor b = a.pad(paddings); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - b - 0 0 0 0 - 0 0 0 0 - 0 100 200 0 - 300 400 500 0 - 0 0 0 0 - 0 0 0 0 - 0 0 0 0 - - -### extract_patches(const PatchDims& patch_dims) - -Returns a tensor of coefficient patches extracted from the input tensor, where -each patch is of dimension specified by 'patch_dims'. 
The returned tensor has -one greater dimension than the input tensor, which is used to index each patch. -The patch index in the output tensor depends on the data layout of the input -tensor: the patch index is the last dimension ColMajor layout, and the first -dimension in RowMajor layout. - -For example, given the following input tensor: - - Eigen::Tensor tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); - - cout << "tensor: " << endl << tensor << endl; -=> -tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - -Six 2x2 patches can be extracted and indexed using the following code: - - Eigen::Tensor patch; - Eigen::array patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } - } - cout << endl; - } - } - -This code results in the following output when the data layout is ColMajor: - -patch index: 0 -0 1 -4 5 -patch index: 1 -4 5 -8 9 -patch index: 2 -1 2 -5 6 -patch index: 3 -5 6 -9 10 -patch index: 4 -2 3 -6 7 -patch index: 5 -6 7 -10 11 - -This code results in the following output when the data layout is RowMajor: -(NOTE: the set of patches is the same as in ColMajor, but are indexed differently). 
- -patch index: 0 -0 1 -4 5 -patch index: 1 -1 2 -5 6 -patch index: 2 -2 3 -6 7 -patch index: 3 -4 5 -8 9 -patch index: 4 -5 6 -9 10 -patch index: 5 -6 7 -10 11 - -### extract_image_patches(const Index patch_rows, const Index patch_cols, - const Index row_stride, const Index col_stride, - const PaddingType padding_type) - -Returns a tensor of coefficient image patches extracted from the input tensor, -which is expected to have dimensions ordered as follows (depending on the data -layout of the input tensor, and the number of additional dimensions 'N'): - -*) ColMajor -1st dimension: channels (of size d) -2nd dimension: rows (of size r) -3rd dimension: columns (of size c) -4th-Nth dimension: time (for video) or batch (for bulk processing). - -*) RowMajor (reverse order of ColMajor) -1st-Nth dimension: time (for video) or batch (for bulk processing). -N+1'th dimension: columns (of size c) -N+2'th dimension: rows (of size r) -N+3'th dimension: channels (of size d) - -The returned tensor has one greater dimension than the input tensor, which is -used to index each patch. The patch index in the output tensor depends on the -data layout of the input tensor: the patch index is the 4'th dimension in -ColMajor layout, and the 4'th from the last dimension in RowMajor layout. 
- -For example, given the following input tensor with the following dimension -sizes: - *) depth: 2 - *) rows: 3 - *) columns: 5 - *) batch: 7 - - Tensor tensor(2,3,5,7); - Tensor tensor_row_major = tensor.swap_layout(); - -2x2 image patches can be extracted and indexed using the following code: - -*) 2D patch: ColMajor (patch indexed by second-to-last dimension) - Tensor twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 - -*) 2D patch: RowMajor (patch indexed by the second dimension) - Tensor twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - // twod_patch_row_major.dimension(4) == 2 - -## Special Operations - -### cast() - -Returns a tensor of type T with the same dimensions as the original tensor. -The returned tensor contains the values of the original tensor converted to -type T. - - Eigen::Tensor a(2, 3); - Eigen::Tensor b = a.cast(); - -This can be useful for example if you need to do element-wise division of -Tensors of integers. This is not currently supported by the Tensor library -but you can easily cast the tensors to floats to do the division: - - Eigen::Tensor a(2, 3); - a.setValues({{0, 1, 2}, {3, 4, 5}}); - Eigen::Tensor b = - (a.cast() / a.constant(2).cast()).cast(); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 0 1 2 - 3 4 5 - - b - 0 0 1 - 1 2 2 - - -### eval() - -TODO - - -## Representation of scalar values - -Scalar values are often represented by tensors of size 1 and rank 1. It would be -more logical and user friendly to use tensors of rank 0 instead. 
For example -Tensor::maximum() currently returns a Tensor. Similarly, the inner -product of 2 1d tensors (through contractions) returns a 1d tensor. In the -future these operations might be updated to return 0d tensors instead. - -## Limitations - -* The number of tensor dimensions is currently limited to 250 when using a - compiler that supports cxx11. It is limited to only 5 for older compilers. -* The IndexList class requires a cxx11 compliant compiler. You can use an - array of indices instead if you don't have access to a modern compiler. -* On GPUs only floating point values are properly tested and optimized for. -* Complex and integer values are known to be broken on GPUs. If you try to use - them you'll most likely end up triggering a static assertion failure such as - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - - diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h deleted file mode 100644 index 1940a969..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ /dev/null @@ -1,527 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_H - -namespace Eigen { - -/** \class Tensor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor class. - * - * The %Tensor class is the work-horse for all \em dense tensors within Eigen. - * - * The %Tensor class encompasses only dynamic-size objects so far. - * - * The first two template parameters are required: - * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. 
float, double, int or std::complex. - * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). - * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) - * - * The remaining template parameters are optional -- in most cases you don't have to worry about them. - * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either - * \b #AutoAlign or \b #DontAlign. - * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required - * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. - * Support for such operations (i.e. adding two tensors etc.) is planned. - * - * You can access elements of tensors using normal subscripting: - * - * \code - * Eigen::Tensor t(10, 10, 10, 10); - * t(0, 1, 2, 3) = 42.0; - * \endcode - * - * This class can be extended with the help of the plugin mechanism described on the page - * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. - * - * Some notes: - * - *

- *
Relation to other parts of Eigen:
- *
The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that - * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code - * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor - * class does not provide any of these features and is only available as a stand-alone class that just allows for - * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to - * change dramatically.
- *
- * - * \ref TopicStorageOrders - */ - -template -class Tensor : public TensorBase > -{ - public: - typedef Tensor Self; - typedef TensorBase > Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef Scalar_ Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - enum { - IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), - Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true, - RawAccess = true - }; - - static const int Options = Options_; - static const int NumIndices = NumIndices_; - typedef DSizes Dimensions; - - protected: - TensorStorage m_storage; - -#ifdef EIGEN_HAS_SFINAE - template - struct isOfNormalIndex{ - static const bool is_array = internal::is_base_of, CustomIndices>::value; - static const bool is_int = NumTraits::IsInteger; - static const bool value = is_array | is_int; - }; -#endif - - public: - // Metadata - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } - - // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - // work, because that uses base().coeffRef() - and we don't yet - // implement a similar class hierarchy - inline Self& base() { return *this; } - inline const Self& base() const { return *this; } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC inline const Scalar& 
coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array{{firstIndex, secondIndex, otherIndices...}}); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const - { - return coeff(internal::customIndices2Array(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) - { - return coeffRef(internal::customIndices2Array(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - return coeff(array(i0, i1)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - return coeff(array(i0, i1, i2)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - return coeff(array(i0, i1, i2, i3)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - return coeff(array(i0, i1, i2, i3, i4)); - } -#endif - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const - { - return coeff(internal::customIndices2Array(indices)); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const - { - return coeff(indices); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return coeff(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const - { - // The bracket operator is only for vectors, use the parenthesis operator instead. - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array{{firstIndex, secondIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - return coeffRef(array(i0, i1)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - return coeffRef(array(i0, i1, i2)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - return coeffRef(array(i0, i1, i2, i3)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - return coeffRef(array(i0, i1, i2, i3, i4)); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) - { - return coeffRef(indices); - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) - { - return coeffRef(internal::customIndices2Array(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < size()); - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeffRef(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) - { - // The bracket operator is only for vectors, use the parenthesis operator instead - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor() - : m_storage() - { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const Self& other) - : 
m_storage(other.m_storage) - { - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage(firstDimension, otherDimensions...) - { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) - : m_storage(dim1, array(dim1)) - { - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) - : m_storage(dim1*dim2, array(dim1, dim2)) - { - EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) - : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) - { - EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) - : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) - { - EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) - : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) - { - EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#endif - - /** Normal Dimension */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) - : m_storage(internal::array_prod(dimensions), dimensions) - { - EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator(assign, 
DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - void resize(Index firstDimension, IndexTypes... otherDimensions) - { - // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - resize(array{{firstDimension, otherDimensions...}}); - } -#endif - - /** Normal Dimension */ - EIGEN_DEVICE_FUNC void resize(const array& dimensions) - { - int i; - Index size = Index(1); - for (i = 0; i < NumIndices; i++) { - internal::check_rows_cols_for_overflow::run(size, dimensions[i]); - size *= dimensions[i]; - } - #ifdef EIGEN_INITIALIZE_COEFFS - bool size_changed = size != this->size(); - m_storage.resize(size, dimensions); - if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - #else - m_storage.resize(size, dimensions); - #endif - } - - // Why this overload, DSizes is derived from array ??? 
// - EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = dimensions[i]; - } - resize(dims); - } - - EIGEN_DEVICE_FUNC - void resize() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - // Nothing to do: rank 0 tensors have fixed size - } - - /** Custom Dimension */ -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) - { - resize(internal::customIndices2Array(dimensions)); - } -#endif - -#ifndef EIGEN_EMULATE_CXX11_META_H - template - EIGEN_DEVICE_FUNC - void resize(const Sizes& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast(dimensions[i]); - } - resize(dims); - } -#else - template - EIGEN_DEVICE_FUNC - void resize(const Sizes& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast(dimensions[i]); - } - resize(dims); - } -#endif - - protected: - - bool checkIndexRange(const array& indices) const - { - using internal::array_apply_and_reduce; - using internal::array_zip_and_reduce; - using internal::greater_equal_zero_op; - using internal::logical_and_op; - using internal::lesser_op; - - return - // check whether the indices are all >= 0 - array_apply_and_reduce(indices) && - // check whether the indices fit in the dimensions - array_zip_and_reduce(indices, m_storage.dimensions()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const - { - if (Options&RowMajor) { - return m_storage.dimensions().IndexOfRowMajor(indices); - } else { - return m_storage.dimensions().IndexOfColMajor(indices); - } - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h deleted file mode 100644 index d06f40cd..00000000 --- 
a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ /dev/null @@ -1,299 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Eugene Brevdo -// Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H -#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H - -namespace Eigen { -namespace internal { - -/** \class TensorIndexTuple - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor + Index Tuple class. - * - * - */ -template -struct traits > : public traits -{ - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Tuple Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorIndexTupleOp& type; -}; - -template -struct nested, 1, - typename eval >::type> -{ - typedef TensorIndexTupleOp type; -}; - -} // end namespace internal - -template -class TensorIndexTupleOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - typedef Tuple CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename 
XprType::Nested m_xpr; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorIndexTupleOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - - typedef typename TensorEvaluator::Dimensions Dimensions; - static const int NumDims = internal::array_size::value; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return CoeffReturnType(index, m_impl.coeff(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - TensorEvaluator m_impl; -}; - -namespace internal { - -/** \class TensorTupleIndex - * \ingroup CXX11_Tensor_Module - * - * \brief Converts to Tensor > and reduces to Tensor. 
- * - */ -template -struct traits > : public traits -{ - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Index Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorTupleReducerOp& type; -}; - -template -struct nested, 1, - typename eval >::type> -{ - typedef TensorTupleReducerOp type; -}; - -} // end namespace internal - -template -class TensorTupleReducerOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - typedef Index CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, - const ReduceOp& reduce_op, - const int return_dim, - const Dims& reduce_dims) - : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - const ReduceOp& reduce_op() const { return m_reduce_op; } - - EIGEN_DEVICE_FUNC - const Dims& reduce_dims() const { return m_reduce_dims; } - - EIGEN_DEVICE_FUNC - int return_dim() const { return m_return_dim; } - - protected: - typename XprType::Nested m_xpr; - const ReduceOp m_reduce_op; - const int m_return_dim; - const Dims m_reduce_dims; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorTupleReducerOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar 
Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename TensorIndexTupleOp::CoeffReturnType TupleType; - typedef typename TensorEvaluator >, Device>::Dimensions Dimensions; - typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; - static const int NumDims = internal::array_size::value; - typedef array StrideDims; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - Layout = TensorEvaluator >, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_orig_impl(op.expression(), device), - m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { - - gen_strides(m_orig_impl.dimensions(), m_strides); - if (Layout == static_cast(ColMajor)) { - const Index total_size = internal::array_prod(m_orig_impl.dimensions()); - m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; - } else { - const Index total_size = internal::array_prod(m_orig_impl.dimensions()); - m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; - } - m_stride_div = m_strides[m_return_dim]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - const TupleType v = m_impl.coeff(index); - return (m_return_dim < 0) ? 
v.first : (v.first % m_stride_mod) / m_stride_div; - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = 1.0 + - (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); - return m_orig_impl.costPerCoeff(vectorized) + - m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); - } - - private: - EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { - if (m_return_dim < 0) { - return; // Won't be using the strides. - } - eigen_assert(m_return_dim < NumDims && - "Asking to convert index to a dimension outside of the rank"); - - // Calculate m_stride_div and m_stride_mod, which are used to - // calculate the value of an index w.r.t. the m_return_dim. - if (Layout == static_cast(ColMajor)) { - strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - strides[i] = strides[i-1] * dims[i-1]; - } - } else { - strides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - strides[i] = strides[i+1] * dims[i+1]; - } - } - } - - protected: - TensorEvaluator, Device> m_orig_impl; - TensorEvaluator >, Device> m_impl; - const int m_return_dim; - StrideDims m_strides; - Index m_stride_mod; - Index m_stride_div; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h deleted file mode 100644 index 166be200..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ /dev/null @@ -1,181 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H - -namespace Eigen { - -/** \class TensorAssign - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor assignment class. - * - * This class is represents the assignment of the values resulting from the evaluation of - * the rhs expression to the memory locations denoted by the lhs expression. - */ -namespace internal { -template -struct traits > -{ - typedef typename LhsXprType::Scalar Scalar; - typedef typename traits::StorageKind StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const std::size_t NumDimensions = internal::traits::NumDimensions; - static const int Layout = internal::traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorAssignOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorAssignOp type; -}; - -} // end namespace internal - - - -template -class TensorAssignOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename LhsXprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - 
lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename internal::remove_all::type& m_lhs_xpr; - const typename internal::remove_all::type& m_rhs_xpr; -}; - - -template -struct TensorEvaluator, Device> -{ - typedef TensorAssignOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // The dimensions of the lhs and the rhs tensors should be equal to prevent - // overflows and ensure the result is fully initialized. - // TODO: use left impl instead if right impl dimensions are known at compile time. - return m_rightImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - m_leftImpl.evalSubExprsIfNeeded(NULL); - // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non - // null value), attempt to evaluate the rhs expression in place. 
Returns true iff in place - // evaluation isn't supported and the caller still needs to manually assign the values generated - // by the rhs to the lhs. - return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { - m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); - } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_leftImpl.coeff(index); - } - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - return m_leftImpl.template packet(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // We assume that evalPacket or evalScalar is called to perform the - // assignment and account for the cost of the write here, but reduce left - // cost by one load because we are using m_leftImpl.coeffRef. 
- TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); - return m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost( - numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), - left.bytes_stored(), left.compute_cycles()) + - TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); - } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } - - private: - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; -}; - -} - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h deleted file mode 100644 index 7a45a5cf..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ /dev/null @@ -1,1010 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H -#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H - -// clang-format off - -namespace Eigen { - -/** \class TensorBase - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor base class. - * - * This class is the common parent of the Tensor and TensorMap class, thus - * making it possible to use either class interchangably in expressions. 
- */ - -template -class TensorBase -{ - public: - typedef internal::traits DerivedTraits; - typedef typename DerivedTraits::Scalar Scalar; - typedef typename DerivedTraits::Index Index; - typedef typename internal::remove_const::type CoeffReturnType; - static const int NumDimensions = DerivedTraits::NumDimensions; - - // Generic nullary operation support. - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp - nullaryExpr(const CustomNullaryOp& func) const { - return TensorCwiseNullaryOp(derived(), func); - } - - // Coefficient-wise nullary operators - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> - constant(const Scalar& value) const { - return nullaryExpr(internal::scalar_constant_op(value)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> - random() const { - return nullaryExpr(internal::UniformRandomGenerator()); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp - random(const RandomGenerator& gen = RandomGenerator()) const { - return nullaryExpr(gen); - } - - // Tensor generation - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorGeneratorOp - generate(const Generator& generator) const { - return TensorGeneratorOp(derived(), generator); - } - - // Generic unary operation support. 
- template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp - unaryExpr(const CustomUnaryOp& func) const { - return TensorCwiseUnaryOp(derived(), func); - } - - // Coefficient-wise unary operators - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator-() const { - return unaryExpr(internal::scalar_opposite_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sqrt() const { - return unaryExpr(internal::scalar_sqrt_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sign() const { - return unaryExpr(internal::scalar_sign_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - rsqrt() const { - return unaryExpr(internal::scalar_rsqrt_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - square() const { - return unaryExpr(internal::scalar_square_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cube() const { - return unaryExpr(internal::scalar_cube_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - inverse() const { - return unaryExpr(internal::scalar_inverse_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - tanh() const { - return unaryExpr(internal::scalar_tanh_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - lgamma() const { - return unaryExpr(internal::scalar_lgamma_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - digamma() const { - return unaryExpr(internal::scalar_digamma_op()); - } - - // igamma(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igamma(const OtherDerived& other) const { - return 
binaryExpr(other.derived(), internal::scalar_igamma_op()); - } - - // igammac(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igammac(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igammac_op()); - } - - // zeta(x = this, q = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - zeta(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_zeta_op()); - } - - // polygamma(n = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - polygamma(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_polygamma_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - erf() const { - return unaryExpr(internal::scalar_erf_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - erfc() const { - return unaryExpr(internal::scalar_erfc_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sigmoid() const { - return unaryExpr(internal::scalar_sigmoid_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - exp() const { - return unaryExpr(internal::scalar_exp_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - log() const { - return unaryExpr(internal::scalar_log_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - log1p() const { - return unaryExpr(internal::scalar_log1p_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - abs() const { - return unaryExpr(internal::scalar_abs_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp, const Derived> - conjugate() const { - return unaryExpr(internal::scalar_conjugate_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - pow(Scalar exponent) const { - return unaryExpr(internal::bind2nd_op >(exponent)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - real() const { - return unaryExpr(internal::scalar_real_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - imag() const { - return unaryExpr(internal::scalar_imag_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator+ (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator+ (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator- (Scalar rhs) const { - EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator- (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator* (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator* (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator/ (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op >(rhs)); - } - 
- EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator/ (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator% (Scalar rhs) const { - EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); - return unaryExpr(internal::scalar_mod_op(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - cwiseMax(Scalar threshold) const { - return cwiseMax(constant(threshold)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - cwiseMin(Scalar threshold) const { - return cwiseMin(constant(threshold)); - } - - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorConversionOp - cast() const { - return TensorConversionOp(derived()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - round() const { - return unaryExpr(internal::scalar_round_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - ceil() const { - return unaryExpr(internal::scalar_ceil_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - floor() const { - return unaryExpr(internal::scalar_floor_op()); - } - - // Generic binary operation support. - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp - binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { - return TensorCwiseBinaryOp(derived(), other, func); - } - - // Coefficient-wise binary operators. 
- template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator+(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_sum_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator-(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_difference_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator*(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_product_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator/(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_quotient_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMax(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_max_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMin(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_min_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp - operator&&(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp - operator||(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp - operator^(const OtherDerived& other) const { - return binaryExpr(other.derived(), 
internal::scalar_boolean_xor_op()); - } - - // Comparisons and tests. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - - // comparisons and tests for Scalars - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator<(Scalar threshold) const { - return operator<(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator<=(Scalar threshold) const { - return operator<=(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const 
Derived, const TensorCwiseNullaryOp, const Derived> > - operator>(Scalar threshold) const { - return operator>(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator>=(Scalar threshold) const { - return operator>=(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator==(Scalar threshold) const { - return operator==(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator!=(Scalar threshold) const { - return operator!=(constant(threshold)); - } - - // Checks - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - (isnan)() const { - return unaryExpr(internal::scalar_isnan_op()); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - (isinf)() const { - return unaryExpr(internal::scalar_isinf_op()); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - (isfinite)() const { - return unaryExpr(internal::scalar_isfinite_op()); - } - - // Coefficient-wise ternary operators. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSelectOp - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { - return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); - } - - // Contractions. - typedef Eigen::IndexPair DimensionPair; - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp - contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp(derived(), other.derived(), dims); - } - - // Convolutions. 
- template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConvolutionOp - convolve(const KernelDerived& kernel, const Dimensions& dims) const { - return TensorConvolutionOp(derived(), kernel.derived(), dims); - } - - // Fourier transforms - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorFFTOp - fft(const FFT& fft) const { - return TensorFFTOp(derived(), fft); - } - - // Scan. - typedef TensorScanOp, const Derived> TensorScanSumOp; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanSumOp - cumsum(const Index& axis, bool exclusive = false) const { - return TensorScanSumOp(derived(), axis, exclusive); - } - - typedef TensorScanOp, const Derived> TensorScanProdOp; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanProdOp - cumprod(const Index& axis, bool exclusive = false) const { - return TensorScanProdOp(derived(), axis, exclusive); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanOp - scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { - return TensorScanOp(derived(), axis, exclusive, reducer); - } - - // Reductions. 
- template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - sum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - sum() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - mean(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - mean() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - prod(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - prod() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - maximum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - maximum() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - minimum(const Dims& dims) const { - return 
TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - minimum() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > - all(const Dims& dims) const { - return cast().reduce(dims, internal::AndReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > - all() const { - DimensionList in_dims; - return cast().reduce(in_dims, internal::AndReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > - any(const Dims& dims) const { - return cast().reduce(dims, internal::OrReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > - any() const { - DimensionList in_dims; - return cast().reduce(in_dims, internal::OrReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, const Derived> - argmax() const { - array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; - return TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMaxTupleReducer >(), -1, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, const Derived> - argmin() const { - array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; - return TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMinTupleReducer >(), -1, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, const Derived> - argmax(const int return_dim) const 
{ - array in_dims; - in_dims[0] = return_dim; - return TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMaxTupleReducer >(), return_dim, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, const Derived> - argmin(const int return_dim) const { - array in_dims; - in_dims[0] = return_dim; - return TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMinTupleReducer >(), return_dim, in_dims); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp - reduce(const Dims& dims, const Reducer& reducer) const { - return TensorReductionOp(derived(), dims, reducer); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorBroadcastingOp - broadcast(const Broadcast& broadcast) const { - return TensorBroadcastingOp(derived(), broadcast); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConcatenationOp - concatenate(const OtherDerived& other, Axis axis) const { - return TensorConcatenationOp(derived(), other.derived(), axis); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPatchOp - extract_patches(const PatchDims& patch_dims) const { - return TensorPatchOp(derived(), patch_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorImagePatchOp - extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, - const Index row_stride = 1, const Index col_stride = 1, - const Index in_row_stride = 1, const Index in_col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { - return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, - in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorImagePatchOp - extract_image_patches(const 
Index patch_rows, const Index patch_cols, - const Index row_stride, const Index col_stride, - const Index in_row_stride, const Index in_col_stride, - const Index row_inflate_stride, const Index col_inflate_stride, - const Index padding_top, const Index padding_bottom, - const Index padding_left,const Index padding_right, - const Scalar padding_value) const { - return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, - in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, - padding_top, padding_bottom, padding_left, padding_right, padding_value); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorVolumePatchOp - extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, - const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { - return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorVolumePatchOp - extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, - const Index plane_stride, const Index row_stride, const Index col_stride, - const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, - const Index padding_top_z, const Index padding_bottom_z, - const Index padding_top, const Index padding_bottom, - const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { - return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); - } - - // Morphing 
operators. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorLayoutSwapOp - swap_layout() const { - return TensorLayoutSwapOp(derived()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp - reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp(derived(), newDimensions); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) const { - return TensorSlicingOp(derived(), startIndices, sizes); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingSlicingOp - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { - return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset) const { - return TensorChippingOp(derived(), offset, DimId); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset, const Index dim) const { - return TensorChippingOp(derived(), offset, dim); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReverseOp - reverse(const ReverseDimensions& rev) const { - return TensorReverseOp(derived(), rev); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPaddingOp - pad(const PaddingDimensions& padding) const { - return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPaddingOp - pad(const PaddingDimensions& padding, const Scalar padding_value) const { - return TensorPaddingOp(derived(), padding, padding_value); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingOp - stride(const 
Strides& strides) const { - return TensorStridingOp(derived(), strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorInflationOp - inflate(const Strides& strides) const { - return TensorInflationOp(derived(), strides); - } - - // Returns a tensor containing index/value tuples - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorIndexTupleOp - index_tuples() const { - return TensorIndexTupleOp(derived()); - } - - // Support for custom unary and binary operations - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { - return TensorCustomUnaryOp(derived(), op); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { - return TensorCustomBinaryOp(derived(), other, op); - } - - // Force the evaluation of the expression. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorForcedEvalOp eval() const { - return TensorForcedEvalOp(derived()); - } - - protected: - template friend class Tensor; - template friend class TensorFixedSize; - template friend class TensorBase; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } -}; - -template::value> -class TensorBase : public TensorBase { - public: - typedef internal::traits DerivedTraits; - typedef typename DerivedTraits::Scalar Scalar; - typedef typename DerivedTraits::Index Index; - typedef Scalar CoeffReturnType; - static const int NumDimensions = DerivedTraits::NumDimensions; - - template friend class Tensor; - template friend class TensorFixedSize; - template friend class TensorBase; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = this->constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return 
derived() = this->random(); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = this->template random(); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setValues( - const typename internal::Initializer::InitList& vals) { - TensorEvaluator eval(derived(), DefaultDevice()); - internal::initialize_tensor(eval, vals); - return derived(); - } -#endif // EIGEN_HAS_VARIADIC_TEMPLATES - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator+=(const OtherDerived& other) { - return derived() = derived() + other.derived(); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator-=(const OtherDerived& other) { - return derived() = derived() - other.derived(); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator*=(const OtherDerived& other) { - return derived() = derived() * other.derived(); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator/=(const OtherDerived& other) { - return derived() = derived() / other.derived(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorLayoutSwapOp - swap_layout() const { - return TensorLayoutSwapOp(derived()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorLayoutSwapOp - swap_layout() { - return TensorLayoutSwapOp(derived()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConcatenationOp - concatenate(const OtherDerived& other, const Axis& axis) const { - return TensorConcatenationOp(derived(), other, axis); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorConcatenationOp - concatenate(const OtherDerived& other, const Axis& axis) { - return TensorConcatenationOp(derived(), other, axis); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp - reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp(derived(), newDimensions); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - 
TensorReshapingOp - reshape(const NewDimensions& newDimensions) { - return TensorReshapingOp(derived(), newDimensions); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) const { - return TensorSlicingOp(derived(), startIndices, sizes); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) { - return TensorSlicingOp(derived(), startIndices, sizes); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingSlicingOp - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { - return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingSlicingOp - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { - return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset) const { - return TensorChippingOp(derived(), offset, DimId); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp - chip(const Index offset) { - return TensorChippingOp(derived(), offset, DimId); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset, const Index dim) const { - return TensorChippingOp(derived(), offset, dim); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp - chip(const Index offset, const Index dim) { - return TensorChippingOp(derived(), offset, dim); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReverseOp - reverse(const ReverseDimensions& rev) const { - return TensorReverseOp(derived(), rev); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReverseOp - reverse(const ReverseDimensions& 
rev) { - return TensorReverseOp(derived(), rev); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorShufflingOp - shuffle(const Shuffle& shuffle) { - return TensorShufflingOp(derived(), shuffle); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingOp - stride(const Strides& strides) const { - return TensorStridingOp(derived(), strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingOp - stride(const Strides& strides) { - return TensorStridingOp(derived(), strides); - } - - // Select the device on which to evaluate the expression. - template - TensorDevice device(const DeviceType& device) { - return TensorDevice(device, derived()); - } - - protected: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h deleted file mode 100644 index 4cfe300e..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ /dev/null @@ -1,392 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H -#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H - -namespace Eigen { - -/** \class TensorBroadcasting - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor broadcasting class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorBroadcastingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorBroadcastingOp type; -}; - -template -struct is_input_scalar { - static const bool value = false; -}; -template <> -struct is_input_scalar > { - static const bool value = true; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template -struct is_input_scalar > { - static const bool value = (Sizes::total_size == 1); -}; -#endif - -} // end namespace internal - - - -template -class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) - : m_xpr(expr), m_broadcast(broadcast) {} - - EIGEN_DEVICE_FUNC - const Broadcast& broadcast() const { return m_broadcast; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; 
} - - protected: - typename XprType::Nested m_xpr; - const Broadcast m_broadcast; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorBroadcastingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions InputDimensions; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) - { - // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar - // and store the result in a scalar. Instead one should reshape the scalar into a a N-D - // tensor with N >= 1 of 1 element first and then broadcast. 
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - const InputDimensions& input_dims = m_impl.dimensions(); - const Broadcast& broadcast = op.broadcast(); - for (int i = 0; i < NumDims; ++i) { - eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i] * broadcast[i]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - } else { - m_inputStrides[NumDims-1] = 1; - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims-2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const - { - if (internal::is_input_scalar::type>::value) { - return m_impl.coeff(0); - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - return coeffColMajor(index); - } else { - return coeffRowMajor(index); - } - } - - // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - inputIndex += index; - } else { - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[0]); - } - } - return m_impl.coeff(inputIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const - { - Index inputIndex = 0; - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - inputIndex += index; - } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); - } - } - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const - { - 
if (internal::is_input_scalar::type>::value) { - return internal::pset1(m_impl.coeff(0)); - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); - } else { - return packetRowMajor(index); - } - } - - // Ignore the LoadMode and always use unaligned loads since we can't guarantee - // the alignment at compile time. - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - innermostLoc = index; - } else { - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[0]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. 
- if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - innermostLoc = index; - } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[NumDims-1]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. 
- if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double compute_cost = TensorOpCost::AddCost(); - if (NumDims > 0) { - for (int i = NumDims - 1; i > 0; --i) { - compute_cost += TensorOpCost::DivCost(); - if (internal::index_statically_eq(i, 1)) { - compute_cost += - TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } else { - if (!internal::index_statically_eq(i, 1)) { - compute_cost += TensorOpCost::MulCost() + - TensorOpCost::ModCost() + - TensorOpCost::AddCost(); - } - } - compute_cost += - TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } - } - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - const TensorEvaluator& impl() const { return m_impl; } - - Broadcast functor() const { return m_broadcast; } - - protected: - const Broadcast m_broadcast; - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h deleted file mode 100644 index 1ba7ef17..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ /dev/null @@ -1,384 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H -#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H - -namespace Eigen { - -/** \class TensorKChippingReshaping - * \ingroup CXX11_Tensor_Module - * - * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. - * - * - */ - -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - 1; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorChippingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorChippingOp type; -}; - -template -struct DimensionId -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { - eigen_assert(dim == DimId); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { - return DimId; - } -}; -template <> -struct DimensionId -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { - eigen_assert(dim >= 0); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { - return actual_dim; - } - private: - const DenseIndex actual_dim; -}; - - -} // end namespace internal - - - -template -class TensorChippingOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef 
typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) - : m_xpr(expr), m_offset(offset), m_dim(dim) { - } - - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } - EIGEN_DEVICE_FUNC - const Index dim() const { return m_dim.actualDim(); } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const Index m_offset; - const internal::DimensionId m_dim; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorChippingOp XprType; - static const int NumInputDims = internal::array_size::Dimensions>::value; - static const int NumDims = NumInputDims-1; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets. 
- IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) - { - EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(NumInputDims > m_dim.actualDim()); - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); - - int j = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (i != m_dim.actualDim()) { - m_dimensions[j] = input_dims[i]; - ++j; - } - } - - m_stride = 1; - m_inputStride = 1; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < m_dim.actualDim(); ++i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - } else { - for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - } - m_inputStride *= input_dims[m_dim.actualDim()]; - m_inputOffset = m_stride * op.offset(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - 
(static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(m_stride == 1); - Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = m_impl.coeff(inputIndex); - inputIndex += m_inputStride; - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. - eigen_assert(m_stride > index); - return m_impl.template packet(index + m_inputOffset); - } else { - const Index idx = index / m_stride; - const Index rem = index - idx * m_stride; - if (rem + PacketSize <= m_stride) { - Index inputIndex = idx * m_inputStride + m_inputOffset + rem; - return m_impl.template packet(inputIndex); - } else { - // Cross the stride boundary. Fallback to slow path. 
- EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index); - ++index; - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double cost = 0; - if ((static_cast(Layout) == static_cast(ColMajor) && - m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && - m_dim.actualDim() == NumInputDims - 1)) { - cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } else if ((static_cast(Layout) == static_cast(ColMajor) && - m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && - m_dim.actualDim() == 0)) { - cost += TensorOpCost::AddCost(); - } else { - cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + - 3 * TensorOpCost::AddCost(); - } - - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { - CoeffReturnType* result = const_cast(m_impl.data()); - if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && - result) { - return result + m_inputOffset; - } else { - return NULL; - } - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex; - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { - // m_stride is equal to 1, so let's avoid the integer division. 
- eigen_assert(m_stride == 1); - inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. - eigen_assert(m_stride > index); - inputIndex = index + m_inputOffset; - } else { - const Index idx = index / m_stride; - inputIndex = idx * m_inputStride + m_inputOffset; - index -= idx * m_stride; - inputIndex += index; - } - return inputIndex; - } - - Dimensions m_dimensions; - Index m_stride; - Index m_inputOffset; - Index m_inputStride; - TensorEvaluator m_impl; - const internal::DimensionId m_dim; - const Device& m_device; -}; - - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorChippingOp XprType; - static const int NumInputDims = internal::array_size::Dimensions>::value; - static const int NumDims = NumInputDims-1; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - - if ((static_cast(this->Layout) == static_cast(ColMajor) 
&& this->m_dim.actualDim() == 0) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - internal::pstore(values, x); - Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - for (int i = 0; i < PacketSize; ++i) { - this->m_impl.coeffRef(inputIndex) = values[i]; - inputIndex += this->m_inputStride; - } - } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. - eigen_assert(this->m_stride > index); - this->m_impl.template writePacket(index + this->m_inputOffset, x); - } else { - const Index idx = index / this->m_stride; - const Index rem = index - idx * this->m_stride; - if (rem + PacketSize <= this->m_stride) { - const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; - this->m_impl.template writePacket(inputIndex, x); - } else { - // Cross stride boundary. Fallback to slow path. - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - internal::pstore(values, x); - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index) = values[i]; - ++index; - } - } - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h deleted file mode 100644 index 59bf90d9..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ /dev/null @@ -1,361 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H - -namespace Eigen { - -/** \class TensorConcatenationOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor concatenation class. - * - * - */ -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename promote_storage_type::ret Scalar; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - enum { Flags = 0 }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConcatenationOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConcatenationOp type; -}; - -} // end namespace internal - - -template -class TensorConcatenationOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::nested::type Nested; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename NumTraits::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) - : 
m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Axis m_axis; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorConcatenationOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - static const int RightNumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); 
- EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - eigen_assert(0 <= m_axis && m_axis < NumDims); - const Dimensions& lhs_dims = m_leftImpl.dimensions(); - const Dimensions& rhs_dims = m_rightImpl.dimensions(); - { - int i = 0; - for (; i < m_axis; ++i) { - eigen_assert(lhs_dims[i] > 0); - eigen_assert(lhs_dims[i] == rhs_dims[i]); - m_dimensions[i] = lhs_dims[i]; - } - eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. - eigen_assert(rhs_dims[i] > 0); - m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; - for (++i; i < NumDims; ++i) { - eigen_assert(lhs_dims[i] > 0); - eigen_assert(lhs_dims[i] == rhs_dims[i]); - m_dimensions[i] = lhs_dims[i]; - } - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_leftStrides[0] = 1; - m_rightStrides[0] = 1; - m_outputStrides[0] = 1; - - for (int j = 1; j < NumDims; ++j) { - m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1]; - m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1]; - m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1]; - } - } else { - m_leftStrides[NumDims - 1] = 1; - m_rightStrides[NumDims - 1] = 1; - m_outputStrides[NumDims - 1] = 1; - - for (int j = NumDims - 2; j >= 0; --j) { - m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1]; - m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1]; - m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) - { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() - { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. - // See CL/76180724 comments for more ideas. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Collect dimension-wise indices (subs). - array subs; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; - } - subs[0] = index; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; - } - subs[NumDims - 1] = index; - } - - const Dimensions& left_dims = m_leftImpl.dimensions(); - if (subs[m_axis] < left_dims[m_axis]) { - Index left_index; - if (static_cast(Layout) == static_cast(ColMajor)) { - left_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; - } - } else { - left_index = subs[NumDims - 1]; - for (int i = NumDims - 2; i >= 0; --i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; - } - } - return m_leftImpl.coeff(left_index); - } else { - subs[m_axis] -= left_dims[m_axis]; - const Dimensions& right_dims = m_rightImpl.dimensions(); - Index right_index; - if (static_cast(Layout) == static_cast(ColMajor)) { - right_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; - } - } else { - right_index = subs[NumDims - 1]; - for (int i = NumDims - 2; i >= 0; --i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; - } - } - return m_rightImpl.coeff(right_index); - } - } - - // TODO(phli): Add a 
real vectorization. - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost() + - TensorOpCost::ModCost()); - const double lhs_size = m_leftImpl.dimensions().TotalSize(); - const double rhs_size = m_rightImpl.dimensions().TotalSize(); - return (lhs_size / (lhs_size + rhs_size)) * - m_leftImpl.costPerCoeff(vectorized) + - (rhs_size / (lhs_size + rhs_size)) * - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_outputStrides; - array m_leftStrides; - array m_rightStrides; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; - const Axis m_axis; -}; - -// Eval as lvalue -template - struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorConcatenationOp XprType; - typedef typename Base::Dimensions Dimensions; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) - : Base(op, device) - { - EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - 
typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - // Collect dimension-wise indices (subs). - array subs; - for (int i = Base::NumDims - 1; i > 0; --i) { - subs[i] = index / this->m_outputStrides[i]; - index -= subs[i] * this->m_outputStrides[i]; - } - subs[0] = index; - - const Dimensions& left_dims = this->m_leftImpl.dimensions(); - if (subs[this->m_axis] < left_dims[this->m_axis]) { - Index left_index = subs[0]; - for (int i = 1; i < Base::NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; - } - return this->m_leftImpl.coeffRef(left_index); - } else { - subs[this->m_axis] -= left_dims[this->m_axis]; - const Dimensions& right_dims = this->m_rightImpl.dimensions(); - Index right_index = subs[0]; - for (int i = 1; i < Base::NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; - } - return this->m_rightImpl.coeffRef(right_index); - } - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); - - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - internal::pstore(values, x); - for (int i = 0; i < packetSize; ++i) { - coeffRef(index+i) = values[i]; - } - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h deleted file mode 100644 index 20b29e5f..00000000 --- 
a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ /dev/null @@ -1,628 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H - -namespace Eigen { - -/** \class TensorContraction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor contraction class. - * - * - */ -namespace internal { - -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename gebp_traits::type, - typename remove_const::type>::ResScalar Scalar; - - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - - // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; - static const int Layout = traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorContractionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorContractionOp type; -}; - -template -struct traits, Device_> > { - typedef Indices_ Indices; - typedef LeftArgType_ LeftArgType; - typedef RightArgType_ RightArgType; - typedef Device_ Device; - - // From NumDims below. 
- static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; -}; - -} // end namespace internal - -template -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename internal::gebp_traits::ResScalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Indices m_indices; -}; - - -template -struct TensorContractionEvaluatorBase -{ - typedef typename internal::traits::Indices Indices; - typedef typename internal::traits::LeftArgType LeftArgType; - typedef typename internal::traits::RightArgType RightArgType; - typedef typename internal::traits::Device Device; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - IsAligned = true, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - // Most of the code is assuming that both input tensors are 
ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - typedef DSizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), - m_device(device), - m_result(NULL) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - - DSizes eval_left_dims; - DSizes eval_right_dims; - array, ContractDims> eval_op_indices; - if (static_cast(Layout) == static_cast(ColMajor)) { - // For ColMajor, we keep using the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[i]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[i]; - } - // We keep the pairs of contracting indices. 
- for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = op.indices()[i].first; - eval_op_indices[i].second = op.indices()[i].second; - } - } else { - // For RowMajor, we need to reverse the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; - } - // We need to flip all the pairs of contracting indices as well as - // reversing the dimensions. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; - } - } - - // Check for duplicate axes and make sure the first index in eval_op_indices - // is increasing. Using O(n^2) sorting is OK since ContractDims is small - for (int i = 0; i < ContractDims; i++) { - for (int j = i + 1; j < ContractDims; j++) { - eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && - eval_op_indices[j].second != eval_op_indices[i].second && - "contraction axes should be unique"); - if (eval_op_indices[j].first < eval_op_indices[i].first) { - numext::swap(eval_op_indices[j], eval_op_indices[i]); - } - } - } - - array lhs_strides; - lhs_strides[0] = 1; - for (int i = 0; i < LDims-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; - } - - array rhs_strides; - rhs_strides[0] = 1; - for (int i = 0; i < RDims-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; - } - - if (m_i_strides.size() > 0) m_i_strides[0] = 1; - if (m_j_strides.size() > 0) m_j_strides[0] = 1; - if (m_k_strides.size() > 0) m_k_strides[0] = 1; - - m_i_size = 1; - m_j_size = 1; - m_k_size = 1; - - // To compute the dimension, we simply concatenate the non-contracting - // dimensions of the left and then the right tensor. 
Additionally, we also - // compute the strides corresponding to the left non-contracting - // dimensions and right non-contracting dimensions. - m_lhs_inner_dim_contiguous = true; - int dim_idx = 0; - unsigned int nocontract_idx = 0; - - for (int i = 0; i < LDims; i++) { - // find if we are contracting on index i of left tensor - bool contracting = false; - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].first == i) { - contracting = true; - break; - } - } - if (!contracting) { - // add dimension size to output dimensions - m_dimensions[dim_idx] = eval_left_dims[i]; - m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; - if (dim_idx != i) { - m_lhs_inner_dim_contiguous = false; - } - if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = - m_i_strides[nocontract_idx] * eval_left_dims[i]; - } else { - m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; - } - dim_idx++; - nocontract_idx++; - } - } - - nocontract_idx = 0; - for (int i = 0; i < RDims; i++) { - bool contracting = false; - // find if we are contracting on index i of right tensor - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].second == i) { - contracting = true; - break; - } - } - if (!contracting) { - m_dimensions[dim_idx] = eval_right_dims[i]; - if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = - m_j_strides[nocontract_idx] * eval_right_dims[i]; - } else { - m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; - } - m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; - dim_idx++; - nocontract_idx++; - } - } - - // Now compute the strides corresponding to the contracting dimensions. We - // assumed above that non-contracting axes are represented in the same order - // in the matrix as they are in the tensor. This is not the case for - // contracting axes. 
As the contracting axes must be of the same size in - // each tensor, we'll only look at the first tensor here. - m_rhs_inner_dim_contiguous = true; - m_rhs_inner_dim_reordered = false; - for (int i = 0; i < ContractDims; i++) { - Index left = eval_op_indices[i].first; - Index right = eval_op_indices[i].second; - - Index size = eval_left_dims[left]; - eigen_assert(size == eval_right_dims[right] && - "Contraction axes must be same size"); - - if (i+1 < static_cast(internal::array_size::value)) { - m_k_strides[i+1] = m_k_strides[i] * size; - } else { - m_k_size = m_k_strides[i] * size; - } - m_left_contracting_strides[i] = lhs_strides[left]; - m_right_contracting_strides[i] = rhs_strides[right]; - - if (i > 0 && right < eval_op_indices[i-1].second) { - m_rhs_inner_dim_reordered = true; - } - if (right != i) { - m_rhs_inner_dim_contiguous = false; - } - } - - // If the layout is RowMajor, we need to reverse the m_dimensions - if (static_cast(Layout) == static_cast(RowMajor)) { - for (int i = 0, j = NumDims - 1; i < j; i++, j--) { - numext::swap(m_dimensions[i], m_dimensions[j]); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - 
static_cast(this)->template evalProduct(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - } - - template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { - const Index rows = m_i_size; - const Index cols = m_k_size; - - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; - const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, - m_left_contracting_strides, m_k_strides); - RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, - m_right_contracting_strides, m_k_strides); - - const Scalar alpha(1); - const Index resIncr(1); - - // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) - m_device.memset(buffer, 0, rows * sizeof(Scalar)); - - internal::general_matrix_vector_product::run( - rows, cols, lhs, rhs, - buffer, resIncr, alpha); - } - - template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, 
this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2m_device.deallocate(blockA); - this->m_device.deallocate(blockB); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } - - protected: - // Prevent assignment - TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - Dimensions m_dimensions; - - contract_t m_k_strides; - contract_t m_left_contracting_strides; - contract_t m_right_contracting_strides; - - bool m_lhs_inner_dim_contiguous; - bool m_rhs_inner_dim_contiguous; - bool m_rhs_inner_dim_reordered; - - left_nocontract_t m_i_strides; - 
right_nocontract_t m_j_strides; - left_nocontract_t m_left_nocontract_strides; - right_nocontract_t m_right_nocontract_strides; - - Index m_i_size; - Index m_j_size; - Index m_k_size; - - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; - const Device& m_device; - Scalar* m_result; -}; - - -// evaluator for default device -template -struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - // Could we use NumDimensions here? 
- typedef DSizes Dimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) { } - - template - EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - this->template evalGemm(buffer); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h deleted file mode 100644 index 5cf7b4f7..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ /dev/null @@ -1,56 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H - - -namespace Eigen { -namespace internal { - -enum { - ShardByRow = 0, - ShardByCol = 1 -}; - - -// Default Blocking Strategy -template -class TensorContractionBlocking { - public: - - typedef typename LhsMapper::Scalar LhsScalar; - typedef typename RhsMapper::Scalar RhsScalar; - - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : - kc_(k), mc_(m), nc_(n) - { - if (ShardingType == ShardByCol) { - computeProductBlockingSizes(kc_, mc_, nc_, num_threads); - } - else { - computeProductBlockingSizes(kc_, nc_, mc_, num_threads); - } - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } - - private: - Index kc_; - Index mc_; - Index nc_; -}; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h deleted file mode 100644 index d65dbb40..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ /dev/null @@ -1,1391 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner -// Copyright (C) 2015 Navdeep Jaitly -// Copyright (C) 2014 Eric Martin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -namespace Eigen { - -template -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. 
- - // storage indices - const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; - const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // threadIdx.x: the vertical position in an 8x8 block - // threadIdx.y: the vertical index of the 8x8 block in the grid - // threadIdx.z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. 
- - const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ 
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); 
\ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - 
lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). 
- -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef 
computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. 
- // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (threadIdx.x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); - - if (threadIdx.x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? 
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; - Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; - Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; - Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; - Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; - Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; - Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; - Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; - - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; - } - } - } -#undef res -} - - -template -__global__ void -__launch_bounds__(512) -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = blockIdx.x; 
- const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - - Index lhs_vert = base_m+threadIdx.x*4; - - for (Index k = 0; k < k_size; k += 16) { - lhs_pf0 = internal::pset1(0); - rhs_pf0 = internal::pset1(0); - - Index lhs_horiz = threadIdx.y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(threadIdx.x%4)*4; - Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY 
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((threadIdx.x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - if((threadIdx.x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. 
(60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. - float2 fl1 = lhs_shmem2[koff][threadIdx.x]; - float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; - - int start_feature = threadIdx.y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = threadIdx.y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + 
i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - - Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - lhs_pf0 = internal::pset1(0); - lhs_pf1 = internal::pset1(0); - lhs_pf2 = internal::pset1(0); - lhs_pf3 = internal::pset1(0); - - rhs_pf0 = internal::pset1(0); - rhs_pf1 = internal::pset1(0); - - if (!CHECK_LHS_BOUNDARY) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 
=lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - 
lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((threadIdx.y/4+k+24) < k_size) 
{ - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+threadIdx.x*4; - Index rhs_horiz0 = threadIdx.y*2+base_n; - Index rhs_horiz1 = threadIdx.y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = 
rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+threadIdx.x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+threadIdx.x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. 
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = 
make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - - // first feature is at (threadIdx.y/4) * 8 last is at start + 8. - int start_feature = (threadIdx.y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - - __syncthreads(); - Index horiz_base = (threadIdx.y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert 
+ 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m 
+ 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - 
typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - }; - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - 
const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - } - }; - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and __CUDACC__ -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h deleted file mode 100644 index 9b2cb3ff..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ /dev/null @@ -1,467 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H - -namespace Eigen { - -namespace internal { - -enum { - Rhs = 0, - Lhs = 1 -}; - -/* - * Implementation of the Eigen blas_data_mapper class for tensors. - */ - -template struct CoeffLoader { - enum { - DirectOffsets = false - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { - eigen_assert(false && "unsupported"); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Tensor::PacketReturnType packet(typename Tensor::Index index) const - { - return m_tensor.template packet(index); - } - - - private: - const Tensor m_tensor; -}; - -template struct CoeffLoader { - enum { - DirectOffsets = true - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { - m_data += offset; - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Tensor::PacketReturnType packet(typename Tensor::Index index) const - { - return internal::ploadt_ro(m_data + index); - } - private: - typedef typename Tensor::Scalar Scalar; - const Scalar* m_data; -}; - -template -class SimpleTensorContractionMapper { - public: - EIGEN_DEVICE_FUNC - 
SimpleTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - m_tensor(tensor), - m_nocontract_strides(nocontract_strides), - m_ij_strides(ij_strides), - m_contract_strides(contract_strides), - m_k_strides(k_strides) { } - - enum { - DirectOffsets = CoeffLoader::DirectOffsets - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { - m_tensor.offsetBuffer(offset); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row) const { - // column major assumption - return operator()(row, 0); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { - return m_tensor.coeff(computeIndex(row, col)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { - const bool left = (side == Lhs); - Index nocontract_val = left ? row : col; - Index linidx = 0; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = nocontract_val / m_ij_strides[i]; - linidx += idx * m_nocontract_strides[i]; - nocontract_val -= idx * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx += nocontract_val; - } else { - linidx += nocontract_val * m_nocontract_strides[0]; - } - } - - Index contract_val = left ? 
col : row; - if(array_size::value > 0) { - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } - } - - return linidx; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { - const bool left = (side == Lhs); - Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; - Index linidx[2] = {0, 0}; - if (array_size::value > array_size::value) { - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx[0] += nocontract_val[0]; - linidx[1] += nocontract_val[1]; - } else { - linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; - linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; - } - } - - Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; - if (array_size::value> 0) { - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; - } - } - return IndexPair(linidx[0], linidx[1]); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - // Only claim alignment when we can compute the actual stride (ie when we're - // dealing with the lhs with inner_dim_contiguous. This is because the - // matrix-vector product relies on the stride when dealing with aligned inputs. - return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? 
m_contract_strides[0] : 1; - } - - protected: - CoeffLoader m_tensor; - const nocontract_t m_nocontract_strides; - const nocontract_t m_ij_strides; - const contract_t m_contract_strides; - const contract_t m_k_strides; -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - // current code assumes packet size must be a multiple of 2 - EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - - if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { - const Index index = this->computeIndex(i, j); - eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); - } - - const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); - const Index first = indexPair.first; - const Index last = indexPair.second; - - // We can always do optimized packet reads from left hand side right now, because - // the vertical matrix dimension on the left hand side is never contracting. - // On the right hand side we need to check if the contracting dimensions may have - // been shuffled first. 
- if (Tensor::PacketAccess && - (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { - - return this->m_tensor.template packet(first); - } - - EIGEN_ALIGN_MAX Scalar data[packet_size]; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < packet_size - 1; k += 2) { - const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[packet_size - 1] = this->m_tensor.coeff(last); - - return pload(data); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); - } - return pload(data); - } -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename Tensor::PacketReturnType Packet; - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); - 
} -}; - - -template -class TensorContractionSubMapper { - public: - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; - - typedef BaseTensorContractionMapper ParentMapper; - typedef TensorContractionSubMapper Self; - typedef Self LinearMapper; - - enum { - // We can use direct offsets iff the parent mapper supports then and we can compute the strides. - // TODO: we should also enable direct offsets for the Rhs case. - UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) - }; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { - // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute - // this offset every time we attempt to access a coefficient. - if (UseDirectOffsets) { - Index stride = m_base_mapper.stride(); - m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); - } - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper(i, 0); - } - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper(i, j); - } - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, j); - } - return m_base_mapper.template loadPacket(i + 
m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadHalfPacket(i, 0); - } - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { - if (UseDirectOffsets) { - m_base_mapper.storePacket(i, 0, p); - } - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { - if (UseDirectOffsets) { - return LinearMapper(m_base_mapper, i, j); - } - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { - return false; - } - - private: - ParentMapper m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - - public: - typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } -}; - - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h deleted file mode 100644 index ee16cde9..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ /dev/null @@ -1,1052 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H - -// evaluator for thread pool device -#ifdef EIGEN_USE_THREADS - -namespace Eigen { - -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL -namespace internal { - -template -struct packLhsArg { - LhsScalar* blockA; - const LhsMapper& lhs; - const Index m_start; - const Index k_start; - const Index mc; - const Index kc; -}; - -template -struct packRhsAndKernelArg { - const MaxSizeVector* blockAs; - RhsScalar* blockB; - const RhsMapper& rhs; - OutputMapper& output; - const Index m; - const Index k; - const Index n; - const Index mc; - const Index kc; - const Index nc; - const Index num_threads; - const Index num_blockAs; - const Index max_m; - const Index k_block_idx; - const Index m_block_idx; - const Index n_block_idx; - const Index m_blocks; - const Index n_blocks; - MaxSizeVector* kernel_notifications; - const MaxSizeVector* lhs_notifications; - const bool need_to_pack; -}; - -} // end namespace internal -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { - - typedef ThreadPoolDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. 
If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL - template - void evalProduct(Scalar* buffer) const { - typedef - typename internal::remove_const::type - LhsScalar; - typedef - typename internal::remove_const::type - RhsScalar; - typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, 
right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - typedef internal::blas_data_mapper OutputMapper; - typedef internal::gemm_pack_lhs - LhsPacker; - typedef internal::gemm_pack_rhs< - RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> - RhsPacker; - typedef internal::gebp_kernel - GebpKernel; - - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - if (m == 0 || n == 0 || k == 0) return; - - // Compute a set of algorithm parameters: - // - kernel block sizes (bm, bn, bk) - // - task grain sizes (number of kernels executed per task: gm, gn) - // - number of threads - // - sharding by row/column - // - parallel packing or first lhs then rhs - // and some derived parameters: - // - number of tasks (nm, nn, nk) - // - number of kernels (nm0, nn0) - // Unfortunately, all these parameters are tightly interdependent. - // So in some cases we first compute approximate values, then compute other - // values based on these approximations and then refine the approximations. - - // There are lots of heuristics here. There is some reasoning behind them, - // but ultimately they are just tuned on contraction benchmarks for - // different input configurations, thread counts and instruction sets. - // So feel free to question any of them. - - // Compute whether we want to shard by row or by column. - // This is a first approximation, it will be refined later. Since we don't - // know number of threads yet we use 2, because what's we are most - // interested in at this point is whether it makes sense to use - // parallelization at all or not. - bool shard_by_col = shardByCol(m, n, 2); - - // First approximation of kernel blocking sizes. - // Again, we don't know number of threads yet, so we use 2. 
- Index bm, bn, bk; - if (shard_by_col) { - internal::TensorContractionBlocking - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Compute optimal number of threads. - // Note: we use bk instead of k here because we are interested in amount of - // _parallelizable_ computations, and computations are not parallelizable - // across k dimension. - const TensorOpCost cost = - contractionCost(m, n, bm, bn, bk, shard_by_col, false); - int num_threads = TensorCostModel::numThreads( - static_cast(n) * m, cost, this->m_device.numThreads()); - - // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost - // model is not tuned. Remove this when the cost model is tuned. - if (n == 1) num_threads = 1; - - if (num_threads == 1) { - // The single-threaded algorithm should be faster in this case. - if (n == 1) - this->template evalGemv(buffer); - else - this->template evalGemm(buffer); - return; - } - - // Now that we know number of threads, recalculate sharding and blocking. - shard_by_col = shardByCol(m, n, num_threads); - if (shard_by_col) { - internal::TensorContractionBlocking - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Number of kernels for each dimension. - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index nk = divup(k, bk); - - // Calculate task grain size (number of kernels executed per task). - // This task size coarsening serves two purposes: - // 1. It reduces per-task overheads including synchronization overheads. - // 2. It allows to use caches better (reuse the same packed rhs in several - // consecutive kernels). 
- Index gm = 1; - Index gn = 1; - // If we are sharding by column, then we prefer to reduce rows first. - if (shard_by_col) { - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - } else { - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - } - // Number of tasks in each dimension. - Index nm = divup(nm0, gm); - Index nn = divup(nn0, gn); - - // Last by not least, decide whether we want to issue both lhs and rhs - // packing in parallel; or issue lhs packing first, and then issue rhs - // packing when lhs packing completes (for !shard_by_col lhs and rhs are - // swapped). Parallel packing allows more parallelism (for both packing and - // kernels), while sequential packing provides better locality (once - // a thread finishes rhs packing it proceed to kernels with that rhs). - // First, we are interested in parallel packing if there are few tasks. - bool parallel_pack = num_threads >= nm * nn; - // Also do parallel packing if all data fits into L2$. - if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= - l2CacheSize() * num_threads) - parallel_pack = true; - // But don't do it if we will use each rhs only once. Locality seems to be - // more important in this case. - if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; - - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, - this->m_i_strides, this->m_left_contracting_strides, - this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, - this->m_j_strides, this->m_right_contracting_strides, - this->m_k_strides); - - Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, - k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, - shard_by_col, parallel_pack) - .run(); - } - - // Context coordinates a single parallel gemm operation. 
- template - class Context { - public: - Context(const Device& device, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, - Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack) - : device_(device), - lhs_(lhs), - rhs_(rhs), - buffer_(buffer), - output_(buffer, tm), - num_threads_(num_threads), - shard_by_col_(shard_by_col), - parallel_pack_(parallel_pack), - m_(tm), - n_(tn), - k_(tk), - bm_(bm), - bn_(bn), - bk_(bk), - nm_(nm), - nn_(nn), - nk_(nk), - gm_(gm), - gn_(gn), - nm0_(nm0), - nn0_(nn0) - { - for (Index x = 0; x < P; x++) { - // Normal number of notifications for k slice switch is - // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only - // nm_ + nn_ notifications, because they will not receive notifications - // from preceeding kernels. - state_switch_[x] = - x == 0 - ? 1 - : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + - (x == P - 1 ? nm_ * nn_ : 0); - state_packing_ready_[x] = - parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); - state_kernel_[x] = new std::atomic*[nm_]; - for (Index m = 0; m < nm_; m++) { - state_kernel_[x][m] = new std::atomic[nn_]; - // Kernels generally receive 3 notifications (previous kernel + 2 - // packing), but the first slice won't get notifications from previous - // kernels. - for (Index n = 0; n < nn_; n++) - state_kernel_[x][m][n].store( - (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), - std::memory_order_relaxed); - } - } - - // Allocate memory for packed rhs/lhs matrices. 
- size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - size_t lhs_size = - divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; - size_t rhs_size = - divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; - packed_mem_ = static_cast(internal::aligned_malloc( - (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); - char* mem = static_cast(packed_mem_); - for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { - packed_lhs_[x].resize(nm0_); - for (Index m = 0; m < nm0_; m++) { - packed_lhs_[x][m] = reinterpret_cast(mem); - mem += lhs_size; - } - packed_rhs_[x].resize(nn0_); - for (Index n = 0; n < nn0_; n++) { - packed_rhs_[x][n] = reinterpret_cast(mem); - mem += rhs_size; - } - } - } - - ~Context() { - for (Index x = 0; x < P; x++) { - for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; - delete[] state_kernel_[x]; - } - internal::aligned_free(packed_mem_); - } - - void run() { - // Kick off packing of the first slice. - signal_switch(0, 1); - // Wait for overall completion. - // TODO(dvyukov): this wait can lead to deadlock. - // If nthreads contractions are concurrently submitted from worker - // threads, this wait will block all worker threads and the system will - // deadlock. - done_.Wait(); - } - - private: - Notification done_; - const Device& device_; - LhsMapper& lhs_; - RhsMapper& rhs_; - Scalar* const buffer_; - OutputMapper output_; - const int num_threads_; - const bool shard_by_col_; - const bool parallel_pack_; - // Matrix sizes. - const Index m_; - const Index n_; - const Index k_; - // Block sizes. - const Index bm_; - const Index bn_; - const Index bk_; - // Number of tasks. - const Index nm_; - const Index nn_; - const Index nk_; - // Task grain sizes (number of kernels executed per task). - const Index gm_; - const Index gn_; - // Number of blocks (this is different from ni_/nn_ because of task size - // coarsening). - const Index nm0_; - const Index nn0_; - - // Parallelization strategy. 
- // - // Blocks related to the same k block can run in parallel because they write - // to different output blocks. So we parallelize within k slices, this - // gives us parallelism level of m x n. Before we can start any kernels - // related to k-th slice, we need to issue m lhs packing tasks and n rhs - // packing tasks. - // - // However, there is a bottleneck when we are finishing kernels for k-th - // slice (at the very end there is only 1 runnable kernel). To mitigate this - // bottleneck we allow kernels from k-th and k+1-th slices to run in - // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same - // output block, so they must not run in parallel. - // - // This gives us the following dependency graph. - // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs - // packing tasks. - // Kernel (m, n, k) can start when: - // - kernel (m, n, k-1) has finished - // - lhs packing (m, k) has finished - // - rhs packing (n, k) has finished - // Lhs/rhs packing can start when: - // - all k-1 packing has finished (artificially imposed to limit amount of - // parallel packing) - // - // On top of that we limit runnable tasks to two consecutive k slices. - // This is done to limit amount of memory we need for packed lhs/rhs - // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). - // - // state_switch_ tracks when we are ready to switch to the next k slice. - // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). - // These variable are rolling over 3 consecutive k slices: first two we are - // actively executing + one to track completion of kernels in the second - // slice. - static const Index P = 3; - void* packed_mem_; - std::vector packed_lhs_[P - 1]; - std::vector packed_rhs_[P - 1]; - std::atomic** state_kernel_[P]; - // state_switch_ is frequently modified by worker threads, while other - // fields are read-only after constructor. 
Let's move it to a separate cache - // line to reduce cache-coherency traffic. - char pad_[128]; - std::atomic state_packing_ready_[P]; - std::atomic state_switch_[P]; - - void pack_lhs(Index m, Index k) { - const Index mend = m * gm_ + gm(m); - for (Index m1 = m * gm_; m1 < mend; m1++) - LhsPacker()(packed_lhs_[k % (P - 1)][m1], - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); - - if (!parallel_pack_ && shard_by_col_) { - signal_packing(k); - } else { - signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); - } - } - - void pack_rhs(Index n, Index k) { - const Index nend = n * gn_ + gn(n); - for (Index n1 = n * gn_; n1 < nend; n1++) { - if (k == 0) { - // Zero the output memory in parallel. - // On 10000x2x10000 mm zeroing can easily take half of time. - // Zero (bn x m) row. Safe to do here because all kernels that will - // write to this memory depend on completion of this task. - // Note: don't call device_.memset() here. device_.memset() blocks on - // thread pool worker thread, which can lead to underutilization and - // deadlocks. - memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); - } - RhsPacker()(packed_rhs_[k % (P - 1)][n1], - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); - } - - if (parallel_pack_ || shard_by_col_) { - signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); - } else { - signal_packing(k); - } - } - - void kernel(Index m, Index n, Index k) { - // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consequetive tasks - // (rhs fits into L2$ while lhs only into L3$). 
- const Index nend = n * gn_ + gn(n); - const Index mend = m * gm_ + gm(m); - if (shard_by_col_) { - for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); - } - } else { - for (Index m1 = m * gm_; m1 < mend; m1++) - for (Index n1 = n * gn_; n1 < nend; n1++) { - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); - } - } - signal_kernel(m, n, k + 1, false); - signal_switch(k + 2); - } - - void signal_packing(Index k) { - eigen_assert(!parallel_pack_); - Index s = state_packing_ready_[k % P].fetch_sub(1); - eigen_assert(s > 0); - if (s != 1) return; - state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; - enqueue_packing(k, shard_by_col_); - } - - void signal_kernel(Index m, Index n, Index k, bool sync) { - std::atomic* state = &state_kernel_[k % P][m][n]; - Index s = state->load(); - eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; - state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); - } - - void signal_switch(Index k, Index v = 1) { - Index s = state_switch_[k % P].fetch_sub(v); - eigen_assert(s >= v); - if (s != v) return; - - // Ready to switch to the next k slice. - // Reset counter for the next iteration. - state_switch_[k % P] = - (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + - nm_ * nn_; - if (k < nk_) { - // Issue lhs/rhs packing. Their completion will in turn kick off - // kernels. 
- if (parallel_pack_) { - enqueue_packing(k, !shard_by_col_); - enqueue_packing(k, shard_by_col_); - } else if (shard_by_col_) { - enqueue_packing(k, false); - } else { - enqueue_packing(k, true); - } - - // Termination handling. - // Because kernel completion signals k + 2 switch, we need to finish nk - // + 2 slices without issuing any tasks on nk + 1 slice. So here we - // pretend that all nk + 1 packing tasks just finish instantly; so that - // nk + 2 switch only waits for completion of nk kernels. - } else if (k == nk_) { - signal_switch(k + 1, - parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); - } else { - done_.Notify(); - } - } - - // Enqueue all rhs/lhs packing for k-th slice. - void enqueue_packing(Index k, bool rhs) { - enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); - } - - void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { - if (end - start == 1) { - if (rhs) - pack_rhs(start, k); - else - pack_lhs(start, k); - } else { - Index mid = (start + end) / 2; - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(mid, end, k, rhs); }); - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(start, mid, k, rhs); }); - } - } - - // Block sizes with accounting for potentially incomplete last block. - Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } - Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } - Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } - // Task grain sizes accounting for potentially incomplete last task. - Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } - Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } - - Context(const Context&) = delete; - void operator=(const Context&) = delete; - }; - - // Decide whether we want to shard m x n contraction by columns or by rows. 
- static bool shardByCol(Index m, Index n, Index num_threads) { - // Note: we are comparing both n and m against Traits::nr, it is not - // a mistake. We are trying to figure out how both n and m will fit into - // the main sharding dimension. - - // Sharding by column is the default - // ... unless there is enough data for vectorization over rows - if (m / num_threads >= Traits::nr && - // and not enough data for vectorization over columns - (n / num_threads < Traits::nr || - // ... or barely enough data for vectorization over columns, - // but it is not evenly dividable across threads - (n / num_threads < 4 * Traits::nr && - (n % (num_threads * Traits::nr)) != 0 && - // ... and it is evenly dividable across threads for rows - ((m % (num_threads * Traits::nr)) == 0 || - // .. or it is not evenly dividable for both dimensions but - // there is much more data over rows so that corner effects are - // mitigated. - (m / n >= 6))))) - return false; - // Wait, or if matrices are just substantially prolonged over the other - // dimension. - if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; - return true; - } - - Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, - int num_threads, bool shard_by_col) const { - Index gm = 1; - Index gm1 = 1; - Index nm0 = divup(m, bm); - Index nm1 = nm0; - for (;;) { - // Find the next candidate for m grain size. It needs to result in - // different number of blocks. E.g. if we have 10 kernels, we want to try - // 5 and 10, but not 6, 7, 8 and 9. - while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; - if (gm1 > nm0) break; - // Check the candidate. - int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nm1 = divup(nm0, gm1); - if (res == 0) continue; - // Commit new grain size. 
- gm = gm1; - } - return gm; - } - - Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - int num_threads, bool shard_by_col) const { - Index gn = 1; - Index gn1 = 1; - Index nn0 = divup(n, bn); - Index nn1 = nn0; - for (;;) { - while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; - if (gn1 > nn0) break; - int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nn1 = divup(nn0, gn1); - if (res == 0) continue; - gn = gn1; - } - return gn; - } - - // checkGrain checks whether grain (gm, gn) is suitable and is better than - // (oldgm, oldgn). - int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - Index gn, Index oldgm, Index oldgn, int num_threads, - bool shard_by_col) const { - const TensorOpCost cost = - contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); - double taskSize = TensorCostModel::taskSize( - static_cast(bm) * gm * bn * gn, cost); - // If the task is too small, then we agree on it regardless of anything - // else. Otherwise synchronization overheads will dominate. - if (taskSize < 1) return 1; - // If it is too large, then we reject it and all larger tasks. - if (taskSize > 2) return -1; - // Now we are in presumably good task size range. - // The main deciding factor here is parallelism. Consider that we have 12 - // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. - // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 - // of cores will be busy). While grain size 3 gives us 4 tasks, which gives - // us parallelism of 1 (we can load all cores). 
- Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index new_tasks = divup(nm0, gm) * divup(nn0, gn); - double new_parallelism = static_cast(new_tasks) / - (divup(new_tasks, num_threads) * num_threads); - Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); - double old_parallelism = static_cast(old_tasks) / - (divup(old_tasks, num_threads) * num_threads); - if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; - return 0; - } - -#else // EIGEN_USE_SIMPLE_THREAD_POOL - - template - void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - evalGemm(buffer); - } - - template - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - - const int lhs_packet_size = internal::unpacket_traits::size; - const int rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // TODO: packing could be faster sometimes if we supported row major tensor mappers - typedef internal::gemm_pack_lhs LhsPacker; - typedef internal::gemm_pack_rhs RhsPacker; - - // TODO: replace false, false with conjugate values? 
- typedef internal::gebp_kernel GebpKernel; - - typedef internal::packLhsArg packLArg; - typedef internal::packRhsAndKernelArg packRKArg; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // compute block sizes (which depend on number of threads) - const Index num_threads = this->m_device.numThreads(); - internal::TensorContractionBlocking blocking(k, m, n, num_threads); - Index mc = blocking.mc(); - Index nc = blocking.nc(); - Index kc = blocking.kc(); - eigen_assert(mc <= m); - eigen_assert(nc <= n); - eigen_assert(kc <= k); - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) - const Index k_blocks = CEIL_DIV(k, kc); - const Index n_blocks = CEIL_DIV(n, nc); - const Index m_blocks = CEIL_DIV(m, mc); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - /* cout << "m: " << m << " n: " << n << " k: " << k << endl; - cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; - cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; - cout << "num threads: " << num_threads << endl; - */ - - // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB - // aren't 16 byte aligned segfaults will happen due to SIMD instructions - // note: You can get away with allocating just a single blockA and offsets and meet the - // the alignment requirements with the assumption that - // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = numext::mini(num_threads, m_blocks); - MaxSizeVector blockAs(num_threads); - for (int i = 0; i < num_threads; i++) { - blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); - } - - // To circumvent alignment 
issues, I'm just going to separately allocate the memory for each thread - // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. - // Other options: (1) reuse memory when a thread finishes. con: tricky - // (2) allocate block B memory in each thread. con: overhead - MaxSizeVector blockBs(n_blocks); - for (int i = 0; i < n_blocks; i++) { - blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); - } - - // lhs_notifications starts with all null Notifications - MaxSizeVector lhs_notifications(num_threads, nullptr); - - // this should really be numBlockAs * n_blocks; - const Index num_kernel_notifications = num_threads * n_blocks; - MaxSizeVector kernel_notifications(num_kernel_notifications, - nullptr); - - for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { - const Index k_start = k_block_idx * kc; - // make sure we don't overshoot right edge of left matrix - const Index actual_kc = numext::mini(k_start + kc, k) - k_start; - - for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); - - for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { - const Index m_start = mt_block_idx * mc; - const Index actual_mc = numext::mini(m_start + mc, m) - m_start; - eigen_assert(actual_mc > 0); - - Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; - - for (int i = 0; i < n_blocks; ++i) { - Index notification_id = (blockAId * n_blocks + i); - // Wait for any current kernels using this slot to complete - // before using it. 
- if (kernel_notifications[notification_id]) { - wait_until_ready(kernel_notifications[notification_id]); - delete kernel_notifications[notification_id]; - } - kernel_notifications[notification_id] = new Notification(); - } - const packLArg arg = { - blockAs[blockAId], // blockA - lhs, // lhs - m_start, // m - k_start, // k - actual_mc, // mc - actual_kc, // kc - }; - - // Delete any existing notification since we may be - // replacing it. The algorithm should ensure that there are - // no existing waiters on this notification. - delete lhs_notifications[blockAId]; - lhs_notifications[blockAId] = - this->m_device.enqueue(&Self::packLhs, arg); - } - - // now start kernels. - const Index m_base_start = m_block_idx * mc; - const bool need_to_pack = m_block_idx == 0; - - for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { - const Index n_start = n_block_idx * nc; - const Index actual_nc = numext::mini(n_start + nc, n) - n_start; - - // first make sure the previous kernels are all done before overwriting rhs. Also wait if - // we're going to start new k. In both cases need_to_pack is true. 
- if (need_to_pack) { - for (Index i = num_blocks; i < num_threads; ++i) { - Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; - Index future_id = (blockAId * n_blocks + n_block_idx); - wait_until_ready(kernel_notifications[future_id]); - } - } - - packRKArg arg = { - &blockAs, // blockA - blockBs[n_block_idx], // blockB - rhs, // rhs - output, // output - m_base_start, // m - k_start, // k - n_start, // n - mc, // mc - actual_kc, // kc - actual_nc, // nc - num_threads, - numBlockAs, - m, - k_block_idx, - m_block_idx, - n_block_idx, // n_block_idx - m_blocks, // m_blocks - n_blocks, // n_blocks - &kernel_notifications, // kernel notifications - &lhs_notifications, // lhs notifications - need_to_pack, // need_to_pack - }; - - // We asynchronously kick off this function, which ends up - // notifying the appropriate kernel_notifications objects, - // which this thread waits on before exiting. - this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); - } - } - } - - // Make sure all the kernels are done. - for (size_t i = 0; i < kernel_notifications.size(); ++i) { - wait_until_ready(kernel_notifications[i]); - delete kernel_notifications[i]; - } - - // No need to wait for lhs notifications since they should have - // already been waited on. Just clean them up. - for (size_t i = 0; i < lhs_notifications.size(); ++i) { - delete lhs_notifications[i]; - } - - // deallocate all of the memory for both A and B's - for (size_t i = 0; i < blockAs.size(); i++) { - this->m_device.deallocate(blockAs[i]); - } - for (size_t i = 0; i < blockBs.size(); i++) { - this->m_device.deallocate(blockBs[i]); - } - -#undef CEIL_DIV - } - - /* - * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing - * the LHS block, check that all of the kernels that worked on the same - * mt_block_idx in the previous m_block are done. 
- */ - template - static void packLhs(const packLArg arg) { - // perform actual packing - LhsPacker pack_lhs; - pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); - } - - /* - * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that - * all kernels in the previous block are done. - * Then for each LHS future, we wait on the future and then call GEBP - * on the area packed by the future (which starts at - * blockA + future_idx * mt * kc) on the LHS and with the full packed - * RHS block. - * The output of this GEBP is written to output(m + i * mt, n). - */ - template - static void packRhsAndKernel(packRKArg arg) { - if (arg.need_to_pack) { - RhsPacker pack_rhs; - pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); - } - - GebpKernel gebp; - for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { - const Index m_base_start = arg.m + arg.mc*mt_block_idx; - if (m_base_start < arg.max_m) { - Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; - wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; - gebp(arg.output.getSubMapper(m_base_start, arg.n), - (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); - - // Notify that the kernel is done. - const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; - (*arg.kernel_notifications)[set_idx]->Notify(); - } - } - } -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, - bool shard_by_col, bool prepacked) const { - const int packed_size = std::min(PacketType::size, - PacketType::size); - const int output_packet_size = internal::unpacket_traits::size; - const double kd = static_cast(bk); - // Peak VFMA bandwidth is 0.5. 
However if we have not enough data for - // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = bk == 1 ? 4.0 : - (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif - // Computations. - TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); - // Output stores. - cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); - if (prepacked) { - // Packing and kernels are executed in different tasks. When we calculate - // task grain size we look only at kernel cost assuming that kernel - // is more expensive than packing. - return cost; - } - // Lhs/rhs loads + computations. - TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); - TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); - // Lhs packing memory cost does not contribute considerably to overall - // execution time because lhs is prefetched early and accessed sequentially. - if (shard_by_col) - lhsCost.dropMemoryCost(); - else - rhsCost.dropMemoryCost(); - return cost + lhsCost + rhsCost; - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_THREADS -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h deleted file mode 100644 index 860a6949..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ /dev/null @@ -1,279 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H - -namespace Eigen { - -/** \class TensorConversionOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor conversion class. This class makes it possible to vectorize - * type casting operations when the number of scalars per packet in the source - * and the destination type differ - */ -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef TargetType Scalar; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - enum { Flags = 0 }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConversionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConversionOp type; -}; - -} // end namespace internal - - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - return internal::pcast(m_impl.template packet(index)); - } - - private: - const TensorEvaluator& m_impl; -}; - - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - - 
SrcPacket src1 = m_impl.template packet(index); - SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); - TgtPacket result = internal::pcast(src1, src2); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - - SrcPacket src1 = m_impl.template packet(index); - SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); - SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); - SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); - TgtPacket result = internal::pcast(src1, src2, src3, src4); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - // Only call m_impl.packet() when we have direct access to the underlying data. This - // ensures that we don't compute the subexpression twice. We may however load some - // coefficients twice, but in practice this doesn't negatively impact performance. 
- if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { - // Force unaligned memory loads since we can't ensure alignment anymore - return internal::pcast(m_impl.template packet(index)); - } else { - const int TgtPacketSize = internal::unpacket_traits::size; - typedef typename internal::unpacket_traits::type SrcType; - typedef typename internal::unpacket_traits::type TgtType; - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; - for (int i = 0; i < TgtPacketSize; ++i) { - values[i] = converter(m_impl.coeff(index+i)); - } - TgtPacket rslt = internal::pload(values); - return rslt; - } - } - - private: - const TensorEvaluator& m_impl; - const typename TensorEvaluator::Index m_maxIndex; -}; - -template -class TensorConversionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::nested::type Nested; - typedef Scalar CoeffReturnType; - typedef typename NumTraits::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) - : m_xpr(xpr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { - impl.evalSubExprsIfNeeded(NULL); - return true; - } -}; - -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { - return impl.evalSubExprsIfNeeded(data); - } -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorConversionOp XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef TargetType 
Scalar; - typedef TargetType CoeffReturnType; - typedef typename internal::remove_all::Scalar>::type SrcType; - typedef typename PacketType::type PacketReturnType; - typedef typename PacketType::type PacketSourceType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = true, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) - { - return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() - { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - internal::scalar_cast_op converter; - return converter(m_impl.coeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; - return PacketConv::run(m_impl, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double cast_cost = TensorOpCost::CastCost(); - if (vectorized) { - const double SrcCoeffRatio = - internal::type_casting_traits::SrcCoeffRatio; - const double TgtCoeffRatio = - internal::type_casting_traits::TgtCoeffRatio; - return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + - TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); - } else { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); - } - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - template - struct PacketConv 
{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - }; - - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } - }; - - TensorEvaluator m_impl; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h deleted file mode 100644 index abdf742c..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ /dev/null @@ -1,1104 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. 
- * - * - */ -namespace internal { - -template -class IndexMapper { - public: - IndexMapper(const InputDims& input_dims, const array& kernel_dims, - const array& indices) { - - array dimensions = input_dims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = indices[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - dimensions[index] = result_dim; - } - - array inputStrides; - array outputStrides; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; - outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; - outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } - } else { - inputStrides[NumDims - 1] = 1; - outputStrides[NumDims - 1] = 1; - for (int i = static_cast(NumDims) - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; - outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; - } - } - - array cudaInputDimensions; - array cudaOutputDimensions; - array tmp = dimensions; - array ordering; - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = i + offset; - ordering[index] = indices[i]; - tmp[indices[i]] = -1; - cudaInputDimensions[index] = input_dims[indices[i]]; - cudaOutputDimensions[index] = dimensions[indices[i]]; - } - - int written = static_cast(Layout) == static_cast(ColMajor) - ? 
NumKernelDims - : 0; - for (int i = 0; i < NumDims; ++i) { - if (tmp[i] >= 0) { - ordering[written] = i; - cudaInputDimensions[written] = input_dims[i]; - cudaOutputDimensions[written] = dimensions[i]; - ++written; - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[ordering[i]]; - m_outputStrides[i] = outputStrides[ordering[i]]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; - } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; - } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; - } - } - } - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; - } - inputIndex += p * m_inputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; - } - inputIndex += p * m_inputStrides[limit]; - } - return inputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { - Index outputIndex = 0; - if (static_cast(Layout) == 
static_cast(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; - } - outputIndex += p * m_outputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; - } - outputIndex += p * m_outputStrides[limit]; - } - return outputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 
0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + - k * m_inputStrides[offset + 2]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + - k * m_outputStrides[offset + 2]; - } - - private: - static const int NumDims = internal::array_size::value; - array m_inputStrides; - array m_outputStrides; - array m_cudaInputStrides; - array m_cudaOutputStrides; -}; - - - -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename promote_storage_type::ret Scalar; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename InputXprType::Nested LhsNested; - typedef typename KernelXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConvolutionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConvolutionOp type; -}; - -} // end namespace internal - - - -template -class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind 
StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) - : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& - inputExpression() const { return m_input_xpr; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& - kernelExpression() const { return m_kernel_xpr; } - - protected: - typename InputXprType::Nested m_input_xpr; - typename KernelXprType::Nested m_kernel_xpr; - const Indices m_indices; -}; - - -template -struct TensorEvaluator, Device> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == 
static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; - } - } else { - m_inputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; - } - } - - m_dimensions = m_inputImpl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; - } else { - m_kernelStride[0] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; - } - } else { - for (int i = NumKernelDims - 1; i >= 0; --i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i < NumKernelDims - 1) { - m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; - } else { - m_kernelStride[NumKernelDims - 1] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_inputImpl.evalSubExprsIfNeeded(NULL); - preloadKernel(); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - void evalTo(typename XprType::Scalar* buffer) { - evalSubExprsIfNeeded(NULL); - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - CoeffReturnType result = CoeffReturnType(0); - convolve(firstInput(index), 0, NumKernelDims-1, result); - return result; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const - { - Index indices[2] = {index, index+PacketSize-1}; - Index startInputs[2] = {0, 0}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } - startInputs[0] += indices[0]; - startInputs[1] += indices[1]; - - if (startInputs[1]-startInputs[0] == PacketSize-1) { - PacketReturnType result = internal::pset1(0); - convolvePacket(startInputs[0], 0, NumKernelDims-1, result); - return result; - } else { - EIGEN_ALIGN_MAX Scalar data[PacketSize]; - data[0] = Scalar(0); - convolve(startInputs[0], 0, 
NumKernelDims-1, data[0]); - for (int i = 1; i < PacketSize-1; ++i) { - data[i] = Scalar(0); - convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); - } - data[PacketSize-1] = Scalar(0); - convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); - return internal::pload(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. - const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } - startInput += index; - return startInput; - } - - EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolve(input, kernel, 
DimIndex-1, accum); - } else { - accum += m_inputImpl.coeff(input) * m_kernel[kernel]; - } - } - } - - template - EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolvePacket(input, kernel, DimIndex-1, accum); - } else { - accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - array m_inputStride; - array m_outputStride; - - array m_indexStride; - array m_kernelStride; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; - Dimensions m_dimensions; - - KernelArgType m_kernelArg; - const Scalar* m_kernel; - bool m_local_kernel; - const Device& m_device; -}; - - - - -// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -template -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { - return StaticKernelSize; - } -}; -template <> -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { - return kernelSize; - } -}; - -template -__global__ void EigenConvolutionKernel1D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int kernelSize, float* buffer) { - extern __shared__ float s[]; - - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); - const int num_x_output = last_x - first_x + 1; - - const int first_plane = blockIdx.y * blockDim.y; - const int plane_stride = blockDim.y * gridDim.y; - - for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { - // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = threadIdx.y * num_x_input; - #pragma unroll - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); - } - - __syncthreads(); - - // Compute the convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - const int kernel_offset = plane_kernel_offset + i; - float result = 0.0f; - #pragma unroll - for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { - result += s[k + kernel_offset] * kernel[k]; - } - const int tensor_index = plane_output_offset + 
indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); - buffer[tensor_index] = result; - } - __syncthreads(); - } -}; - -template -__global__ void EigenConvolutionKernel2D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int numY, const int maxY, const int kernelSizeX, - const int kernelSizeY, float* buffer) { - extern __shared__ float s[]; - - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); - const int num_x_output = last_x - first_x + 1; - - const int first_y = blockIdx.y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); - const int num_y_output = last_y - first_y + 1; - - const int first_plane = blockIdx.z * blockDim.z; - const int plane_stride = blockDim.z * gridDim.z; - - for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = threadIdx.z * num_y_input; - - // Load inputs to shared memory - #pragma unroll - for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { - const int input_offset = num_x_input * (j + plane_kernel_offset); - #pragma unroll - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); - s[i + input_offset] = eval.coeff(tensor_index); - } - } - - __syncthreads(); - - // Convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { - #pragma unroll - for (int i = threadIdx.x; i < 
num_x_output; i += blockDim.x) { - float result = 0.0f; - #pragma unroll - for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { - const int kernel_offset = kernelSizeX * l; - const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); - #pragma unroll - for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { - result += s[k + input_offset] * kernel[k + kernel_offset]; - } - } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); - buffer[tensor_index] = result; - } - } - - __syncthreads(); - } -}; - -template -__global__ void EigenConvolutionKernel3D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const size_t numPlanes, const size_t numX, - const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, - const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, - const size_t kernelSizeZ, float* buffer) { - extern __shared__ float s[]; - - // Load inputs to shared memory - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + kernelSizeX; - - const int first_y = blockIdx.y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + kernelSizeY; - - const int first_z = blockIdx.z * maxZ; - const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; - const int num_z_input = last_z - first_z + kernelSizeZ; - - for (int p = 0; p < numPlanes; ++p) { - - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = 0; - - for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { - for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); - s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); - } - } - } - - __syncthreads(); - - // Convolution - const int num_z_output = last_z - first_z + 1; - const int num_y_output = last_y - first_y + 1; - const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); - - for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { - for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { - result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; - } - } - } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); - buffer[tensor_index] = result; - } - } - } - __syncthreads(); - } -}; - - - -template -struct TensorEvaluator, GpuDevice> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename 
TensorEvaluator::Dimensions KernelDimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - 
m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - static unsigned int ceil(unsigned int num, unsigned int denom) { - const unsigned int rounded_toward_zero = num / denom; - if (num > rounded_toward_zero * denom) { - return rounded_toward_zero + 1; - } - return rounded_toward_zero; - } - - void executeEval(Scalar* data) const { - typedef typename TensorEvaluator::Dimensions InputDims; - - const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); - const int warpSize = 32; - - switch (NumKernelDims) { - case 1: { - const int kernel_size = m_kernelImpl.dimensions().TotalSize(); - - const int numX = dimensions()[m_indices[0]]; - const int numP = dimensions().TotalSize() / numX; - int maxX; - dim3 block_size; - - const int single_stride_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 
0 - : m_inputImpl.dimensions().rank() - 1; - if (m_indices[0] == single_stride_dim) { - // Maximum the reuse - const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; - maxX = numext::mini(inner_dim, numX); - const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); - block_size.x = numext::mini(maxThreadsPerBlock, maxX); - block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); - } - else { - // Read as much as possible alongside the inner most dimension, that is the plane - const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); - const int maxP = numext::mini(inner_dim, numP); - maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); - - block_size.x = numext::mini(warpSize, maxX); - block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); - } - - const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); - - dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); - - - //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array indices(m_indices[0]); - const array kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch(kernel_size) { - case 4: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, 
indexMapper, m_kernel, numP, numX, maxX, 4, data); - break; - } - case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); - } - } - break; - } - - case 2: { - const int idxX = - static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; - const int idxY = - static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numP = dimensions().TotalSize() / (numX*numY); - - const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); - - // Snap maxX to warp size - int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; - const int maxX = numext::mini(inner_dim, numX); - const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); - const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); - - dim3 block_size; - block_size.x = numext::mini(1024, maxX); - block_size.y = numext::mini(1024/block_size.x, maxY); - block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); - - const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int num_y_blocks = ceil(numY, maxY); - const int blocksPerProcessor = 
numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); - - dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); - - - //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array indices(m_indices[idxX], m_indices[idxY]); - const array kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch (kernel_size_x) { - case 4: { - switch (kernel_size_y) { - case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); - break; - } - } - break; - } - case 7: { - switch (kernel_size_y) { - case 4: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); - break; - } - } - break; - } - default: { 
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); - break; - } - } - break; - } - - case 3: { - const int idxX = - static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; - const int idxY = - static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; - const int idxZ = - static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; - - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numZ = dimensions()[m_indices[idxZ]]; - const int numP = dimensions().TotalSize() / (numX*numY*numZ); - - const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); - const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); - const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); - - dim3 block_size; - block_size.x = numext::mini(32, maxX); - block_size.y = numext::mini(32, maxY); - block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); - dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); - - const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << 
num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array indices(m_indices[idxX], m_indices[idxY], - m_indices[idxZ]); - const array kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY], - m_kernelImpl.dimensions()[idxZ]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
- const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; - KernelArgType m_kernelArg; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - - const GpuDevice& m_device; -}; -#endif - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h deleted file mode 100644 index 83c449cf..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ /dev/null @@ -1,212 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Rasmus Munk Larsen -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief A cost model used to limit the number of threads used for evaluating - * tensor expression. 
- * - */ - -// Class storing the cost of evaluating a tensor expression in terms of the -// estimated number of operand bytes loads, bytes stored, and compute cycles. -class TensorOpCost { - public: - // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple - // model based on minimal reciprocal throughput numbers from Intel or - // Agner Fog's tables would be better than what is there now. - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { - return internal::functor_traits< - internal::scalar_product_op >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { - return internal::functor_traits >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { - return internal::functor_traits< - internal::scalar_quotient_op >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { - return internal::functor_traits >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { - return internal::functor_traits< - internal::scalar_cast_op >::Cost; - } - - EIGEN_DEVICE_FUNC - TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} - EIGEN_DEVICE_FUNC - TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) - : bytes_loaded_(bytes_loaded), - bytes_stored_(bytes_stored), - compute_cycles_(compute_cycles) {} - - EIGEN_DEVICE_FUNC - TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, - bool vectorized, double packet_size) - : bytes_loaded_(bytes_loaded), - bytes_stored_(bytes_stored), - compute_cycles_(vectorized ? 
compute_cycles / packet_size - : compute_cycles) { - eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); - eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); - eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { - return bytes_loaded_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { - return bytes_stored_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { - return compute_cycles_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( - double load_cost, double store_cost, double compute_cost) const { - return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + - compute_cost * compute_cycles_; - } - - // Drop memory access component. Intended for cases when memory accesses are - // sequential or are completely masked by computations. - EIGEN_DEVICE_FUNC void dropMemoryCost() { - bytes_loaded_ = 0; - bytes_stored_ = 0; - } - - // TODO(rmlarsen): Define min in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( - const TensorOpCost& rhs) const { - double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); - double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); - double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); - return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); - } - - // TODO(rmlarsen): Define max in terms of total cost, not elementwise. 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( - const TensorOpCost& rhs) const { - double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); - double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); - double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); - return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( - const TensorOpCost& rhs) { - bytes_loaded_ += rhs.bytes_loaded(); - bytes_stored_ += rhs.bytes_stored(); - compute_cycles_ += rhs.compute_cycles(); - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { - bytes_loaded_ *= rhs; - bytes_stored_ *= rhs; - compute_cycles_ *= rhs; - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( - TensorOpCost lhs, const TensorOpCost& rhs) { - lhs += rhs; - return lhs; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( - TensorOpCost lhs, double rhs) { - lhs *= rhs; - return lhs; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( - double lhs, TensorOpCost rhs) { - rhs *= lhs; - return rhs; - } - - friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { - return os << "[bytes_loaded = " << tc.bytes_loaded() - << ", bytes_stored = " << tc.bytes_stored() - << ", compute_cycles = " << tc.compute_cycles() << "]"; - } - - private: - double bytes_loaded_; - double bytes_stored_; - double compute_cycles_; -}; - -// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads -// in [1:max_threads] instead of just switching multi-threading off for small -// work units. -template -class TensorCostModel { - public: - // Scaling from Eigen compute cost to device cycles. - static const int kDeviceCyclesPerComputeCycle = 1; - - // Costs in device cycles. 
- static const int kStartupCycles = 100000; - static const int kPerThreadCycles = 100000; - static const int kTaskSize = 40000; - - // Returns the number of threads in [1:max_threads] to use for - // evaluating an expression with the given output size and cost per - // coefficient. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( - double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { - double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); - } - - // taskSize assesses parallel task size. - // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task - // granularity needs to be increased to mitigate parallelization overheads. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( - double output_size, const TensorOpCost& cost_per_coeff) { - return totalCost(output_size, cost_per_coeff) / kTaskSize; - } - - private: - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( - double output_size, const TensorOpCost& cost_per_coeff) { - // Cost of memory fetches from L2 cache. 64 is typical cache line size. - // 11 is L2 cache latency on Haswell. - // We don't know whether data is in L1, L2 or L3. But we are most interested - // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not intersting - // either because we are probably using all available threads already). - // And for the target time range, L2 seems to be what matters. Data set - // fitting into L1 is too small to take noticeable time. Data set fitting - // only into L3 presumably will take more than 10ms to load and process. - const double kLoadCycles = 1.0 / 64 * 11; - const double kStoreCycles = 1.0 / 64 * 11; - // Scaling from Eigen compute cost to device cycles. 
- return output_size * - cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, - kDeviceCyclesPerComputeCycle); - } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h deleted file mode 100644 index e020d076..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ /dev/null @@ -1,313 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H -#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H - -namespace Eigen { - -/** \class TensorCustomUnaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor custom class. 
- * - * - */ -namespace internal { -template -struct traits > -{ - typedef typename XprType::Scalar Scalar; - typedef typename XprType::StorageKind StorageKind; - typedef typename XprType::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCustomUnaryOp& type; -}; - -template -struct nested > -{ - typedef TensorCustomUnaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) - : m_expr(expr), m_func(func) {} - - EIGEN_DEVICE_FUNC - const CustomUnaryFunc& func() const { return m_func; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_expr; } - - protected: - typename XprType::Nested m_expr; - const CustomUnaryFunc m_func; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorCustomUnaryOp ArgType; - typedef typename internal::traits::Index Index; - static const int NumDims = internal::traits::NumDimensions; - typedef DSizes Dimensions; - typedef typename internal::remove_const::type Scalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = 
(internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) - : m_op(op), m_device(device), m_result(NULL) - { - m_dimensions = op.func().dimensions(op.expression()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } - - protected: - EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { - TensorMap > result( - data, m_dimensions); - m_op.func().eval(m_op.expression(), result, m_device); - } - - Dimensions m_dimensions; - const ArgType m_op; - const Device& m_device; - CoeffReturnType* m_result; -}; - - - -/** \class TensorCustomBinaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor custom class. 
- * - * - */ -namespace internal { -template -struct traits > -{ - typedef typename internal::promote_storage_type::ret Scalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCustomBinaryOp& type; -}; - -template -struct nested > -{ - typedef TensorCustomBinaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCustomBinaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::traits::CoeffReturnType CoeffReturnType; - typedef typename internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) - - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} - - EIGEN_DEVICE_FUNC - const CustomBinaryFunc& func() const { return m_func; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const CustomBinaryFunc m_func; -}; - - -// Eval as rvalue -template 
-struct TensorEvaluator, Device> -{ - typedef TensorCustomBinaryOp XprType; - typedef typename internal::traits::Index Index; - static const int NumDims = internal::traits::NumDimensions; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_op(op), m_device(device), m_result(NULL) - { - m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } - - protected: - EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { - TensorMap > result(data, m_dimensions); - m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); - } - - Dimensions m_dimensions; - const XprType m_op; - const Device& m_device; - CoeffReturnType* m_result; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h deleted file mode 100644 index 29e50a3b..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ /dev/null @@ -1,68 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H - -namespace Eigen { - -/** \class TensorDevice - * \ingroup CXX11_Tensor_Module - * - * \brief Pseudo expression providing an operator = that will evaluate its argument - * on the specified computing 'device' (GPU, thread pool, ...) - * - * Example: - * C.device(EIGEN_GPU) = A + B; - * - * Todo: operator *= and /=. 
- */ - -template class TensorDevice { - public: - TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} - - template - EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp Assign; - Assign assign(m_expression, other); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; - Sum sum(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, sum); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; - Difference difference(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, difference); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - protected: - const DeviceType& m_device; - ExpressionType& m_expression; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h deleted file mode 100644 index 4f5767bc..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ /dev/null @@ -1,337 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - -namespace Eigen { - -static const int kCudaScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const cudaStream_t& stream() const = 0; - virtual const cudaDeviceProp& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. -#if __cplusplus >= 201103L - static std::atomic first(true); - if (first.exchange(false)) { -#else - static bool first = true; - if (first) { - first = false; -#endif - // We're the first thread to reach this point. 
- int num_devices; - cudaError_t status = cudaGetDeviceCount(&num_devices); - if (status != cudaSuccess) { - std::cerr << "Failed to get the number of CUDA devices: " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - m_deviceProperties = new cudaDeviceProp[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - if (status != cudaSuccess) { - std::cerr << "Failed to initialize CUDA device #" - << i - << ": " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - } - -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_release); -#endif - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_acquire); -#endif - sleep(1); - } - } - } -} - -static const cudaStream_t default_stream = cudaStreamDefault; - -class CudaStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - cudaGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. 
- CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - cudaGetDevice(&device_); - } else { - int num_devices; - cudaError_t err = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~CudaStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const cudaStream_t& stream() const { return *stream_; } - const cudaDeviceProp& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - void* result; - err = cudaMalloc(&result, num_bytes); - assert(err == cudaSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(buffer != NULL); - err = cudaFree(buffer); - assert(err == cudaSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kCudaScratchSize; - semaphore_ = reinterpret_cast(scratch); - cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - return semaphore_; - } - - private: - const cudaStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. 
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. - EIGEN_STRONG_INLINE const cudaStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default 
device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) - cudaError_t err = cudaStreamSynchronize(stream_->stream()); - if (err != cudaSuccess) { - std::cerr << "Error detected in CUDA stream: " - << cudaGetErrorString(err) - << std::endl; - assert(err == cudaSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the CUDA runtime recorded an error for the - // underlying stream device. 
- inline bool ok() const { -#ifdef __CUDACC__ - cudaError_t error = cudaStreamQuery(stream_->stream()); - return (error == cudaSuccess) || (error == cudaErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - - -// FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ -static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ - cudaError_t status = cudaDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h deleted file mode 100644 index 9d141395..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ /dev/null @@ -1,81 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H - - -namespace Eigen { - -// Default device for the machine (typically a single cpu core) -struct DefaultDevice { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ - // Running on the host CPU - return 1; -#else - // Running on a CUDA device - return 32; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#ifndef __CUDA_ARCH__ - // Running on the host CPU - return l1CacheSize(); -#else - // Running on a CUDA device, return the amount of shared memory available. 
- return 48*1024; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#ifndef __CUDA_ARCH__ - // Running single threaded on the host CPU - return l3CacheSize(); -#else - // Running on a CUDA device - return firstLevelCacheSize(); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ - // Running single threaded on the host CPU - // Should return an enum that encodes the ISA supported by the CPU - return 1; -#else - // Running on a CUDA device - return __CUDA_ARCH__ / 100; -#endif - } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h deleted file mode 100644 index 7c039890..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ /dev/null @@ -1,122 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// Copyright (C) 2016 Benoit Steiner - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H - -namespace Eigen { -struct SyclDevice { - /// class members - /// sycl queue - mutable cl::sycl::queue m_queue; - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. 
- mutable std::map> buffer_map; - /// creating device by using selector - template SyclDevice(dev_Selector s) - : -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - } - })) -#else - m_queue(cl::sycl::queue(s)) -#endif - {} - // destructor - ~SyclDevice() { deallocate_all(); } - - template void deallocate(T *p) const { - auto it = buffer_map.find(p); - if (it != buffer_map.end()) { - buffer_map.erase(it); - internal::aligned_free(p); - } - } - void deallocate_all() const { - std::map>::iterator it=buffer_map.begin(); - while (it!=buffer_map.end()) { - auto p=it->first; - buffer_map.erase(it); - internal::aligned_free(const_cast(p)); - it=buffer_map.begin(); - } - buffer_map.clear(); - } - - /// creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - ///the function then adds an entry by creating a sycl buffer for that particular pointer. 
- template inline cl::sycl::accessor - get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { - return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); - } - - template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { - using Type = cl::sycl::buffer; - std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), - [](void *dataMem) { delete static_cast(dataMem); }))); - (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); - return ret; - } - - template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { - return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); - } - - /// allocating memory on the cpu - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { - return internal::aligned_malloc(8); - } - - // some runtime conditions that can be applied here - bool isDeviceSuitable() const { return true; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - ::memcpy(dst, src, n); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { - auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); - memcpy(host_acc.get_pointer(), src, n); - } - /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { - auto it = buffer_map.find(src); - if (it != buffer_map.end()) { - auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); - memcpy(dst,host_acc.get_pointer(), n); - } else{ - eigen_assert("no device memory found. 
The memory might be destroyed before creation"); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return 1; - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h deleted file mode 100644 index 069680a1..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ /dev/null @@ -1,279 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H - -namespace Eigen { - -// Use the SimpleThreadPool by default. We'll switch to the new non blocking -// thread pool later. -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template using ThreadPoolTempl = NonBlockingThreadPoolTempl; -typedef NonBlockingThreadPool ThreadPool; -#else -template using ThreadPoolTempl = SimpleThreadPoolTempl; -typedef SimpleThreadPool ThreadPool; -#endif - - -// Barrier is an object that allows one or more threads to wait until -// Notify has been called a specified number of times. 
-class Barrier { - public: - Barrier(unsigned int count) : state_(count << 1), notified_(false) { - eigen_assert(((count << 1) >> 1) == count); - } - ~Barrier() { - eigen_assert((state_>>1) == 0); - } - - void Notify() { - unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; - if (v != 1) { - eigen_assert(((v + 2) & ~1) != 0); - return; // either count has not dropped to 0, or waiter is not waiting - } - std::unique_lock l(mu_); - eigen_assert(!notified_); - notified_ = true; - cv_.notify_all(); - } - - void Wait() { - unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); - if ((v >> 1) == 0) return; - std::unique_lock l(mu_); - while (!notified_) { - cv_.wait(l); - } - } - - private: - std::mutex mu_; - std::condition_variable cv_; - std::atomic state_; // low bit is waiter flag - bool notified_; -}; - - -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. -// -// Multiple threads can wait on the same Notification object, -// but only one caller must call Notify() on the object. -struct Notification : Barrier { - Notification() : Barrier(1) {}; -}; - - -// Runs an arbitrary function and then calls Notify() on the passed in -// Notification. -template struct FunctionWrapperWithNotification -{ - static void run(Notification* n, Function f, Args... args) { - f(args...); - if (n) { - n->Notify(); - } - } -}; - -template struct FunctionWrapperWithBarrier -{ - static void run(Barrier* b, Function f, Args... args) { - f(args...); - if (b) { - b->Notify(); - } - } -}; - -template -static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { - if (n) { - n->Wait(); - } -} - - -// Build a thread pool device on top the an existing pool of threads. -struct ThreadPoolDevice { - // The ownership of the thread pool remains with the caller. 
- ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - - EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - - EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_STRONG_INLINE int numThreads() const { - return num_threads_; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - return l1CacheSize(); - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // The l3 cache size is shared between all the cores. - return l3CacheSize() / num_threads_; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - // Should return an enum that encodes the ISA supported by the CPU - return 1; - } - - template - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { - Notification* n = new Notification(); - pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); - return n; - } - - template - EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, - Function&& f, - Args&&... args) const { - pool_->Schedule(std::bind( - &FunctionWrapperWithBarrier::run, b, f, args...)); - } - - template - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - pool_->Schedule(std::bind(f, args...)); - } - - // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if - // called from one of the threads in pool_. Returns -1 otherwise. 
- EIGEN_STRONG_INLINE int currentThreadId() const { - return pool_->CurrentThreadId(); - } - - // parallelFor executes f with [0, n) arguments in parallel and waits for - // completion. F accepts a half-open interval [first, last). - // Block size is choosen based on the iteration cost and resulting parallel - // efficiency. If block_align is not nullptr, it is called to round up the - // block size. - void parallelFor(Index n, const TensorOpCost& cost, - std::function block_align, - std::function f) const { - typedef TensorCostModel CostModel; - if (n <= 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { - f(0, n); - return; - } - - // Calculate block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail - // effect and potential load imbalance and we also want number - // of blocks to be evenly dividable across threads. - - double block_size_f = 1.0 / CostModel::taskSize(1, cost); - Index block_size = numext::mini(n, numext::maxi(1, block_size_f)); - const Index max_block_size = - numext::mini(n, numext::maxi(1, 2 * block_size_f)); - if (block_align) { - Index new_block_size = block_align(block_size); - eigen_assert(new_block_size >= block_size); - block_size = numext::mini(n, new_block_size); - } - Index block_count = divup(n, block_size); - // Calculate parallel efficiency as fraction of total CPU time used for - // computations: - double max_efficiency = - static_cast(block_count) / - (divup(block_count, numThreads()) * numThreads()); - // Now try to increase block size up to max_block_size as long as it - // doesn't decrease parallel efficiency. - for (Index prev_block_count = block_count; prev_block_count > 1;) { - // This is the next block size that divides size into a smaller number - // of blocks than the current block_size. 
- Index coarser_block_size = divup(n, prev_block_count - 1); - if (block_align) { - Index new_block_size = block_align(coarser_block_size); - eigen_assert(new_block_size >= coarser_block_size); - coarser_block_size = numext::mini(n, new_block_size); - } - if (coarser_block_size > max_block_size) { - break; // Reached max block size. Stop. - } - // Recalculate parallel efficiency. - const Index coarser_block_count = divup(n, coarser_block_size); - eigen_assert(coarser_block_count < prev_block_count); - prev_block_count = coarser_block_count; - const double coarser_efficiency = - static_cast(coarser_block_count) / - (divup(coarser_block_count, numThreads()) * numThreads()); - if (coarser_efficiency + 0.01 >= max_efficiency) { - // Taking it. - block_size = coarser_block_size; - block_count = coarser_block_count; - if (max_efficiency < coarser_efficiency) { - max_efficiency = coarser_efficiency; - } - } - } - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { - if (last - first <= block_size) { - // Single block or less, execute directly. - f(first, last); - barrier.Notify(); - return; - } - // Split into halves and submit to the pool. - Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); - }; - handleRange(0, n); - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. 
- void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, nullptr, std::move(f)); - } - - private: - ThreadPoolInterface* pool_; - int num_threads_; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h deleted file mode 100644 index 1a30e45f..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +++ /dev/null @@ -1,236 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H -#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H - -namespace Eigen { - -/** \internal - * - * \class TensorDimensionList - * \ingroup CXX11_Tensor_Module - * - * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
- * - * \sa Tensor - */ - -template struct DimensionList { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - const Index operator[] (const Index i) const { return i; } -}; - -namespace internal { - -template struct array_size > { - static const size_t value = Rank; -}; -template struct array_size > { - static const size_t value = Rank; -}; - -template const Index array_get(DimensionList&) { - return n; -} -template const Index array_get(const DimensionList&) { - return n; -} - - -#if EIGEN_HAS_CONSTEXPR -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return true; - } -}; -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return true; - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; - -template -struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; -template -struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; - -template -struct index_statically_eq_impl > { - static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i == value; - } -}; -template -struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i == value; - } -}; - -template -struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i != value; - } -}; -template -struct index_statically_ne_impl > { - static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i != value; - } -}; - -template -struct index_statically_gt_impl 
> { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i > value; - } -}; -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i > value; - } -}; - -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i < value; - } -}; -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i < value; - } -}; - -#else -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return true; - } -}; -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return true; - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; - -template -struct indices_statically_known_to_increase_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; -template -struct indices_statically_known_to_increase_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; - -template -struct index_statically_eq_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template -struct index_statically_eq_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template -struct index_statically_ne_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ - return false; 
- } -}; -template -struct index_statically_ne_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template -struct index_statically_gt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template -struct index_statically_gt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template -struct index_statically_lt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template -struct index_statically_lt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -#endif - -} // end namespace internal -} // end namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h deleted file mode 100644 index b24cdebf..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ /dev/null @@ -1,428 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H - - -namespace Eigen { - -/** \internal - * - * \class TensorDimensions - * \ingroup CXX11_Tensor_Module - * - * \brief Set of classes used to encode and store the dimensions of a Tensor. 
- * - * The Sizes class encodes as part of the type the number of dimensions and the - * sizes corresponding to each dimension. It uses no storage space since it is - * entirely known at compile time. - * The DSizes class is its dynamic sibling: the number of dimensions is known - * at compile time but the sizes are set during execution. - * - * \sa Tensor - */ - -// Boilerplate code -namespace internal { - -template struct dget { - static const std::size_t value = get::value; -}; - - -template -struct fixed_size_tensor_index_linearization_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(array const& indices, - const Dimensions& dimensions) - { - return array_get(indices) + - dget::value * - fixed_size_tensor_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct fixed_size_tensor_index_linearization_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(array const&, const Dimensions&) - { - return 0; - } -}; - -template -struct fixed_size_tensor_index_extraction_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(const Index index, - const Dimensions& dimensions) - { - const Index mult = (index == n-1) ? 
1 : 0; - return array_get(dimensions) * mult + - fixed_size_tensor_index_extraction_helper::run(index, dimensions); - } -}; - -template -struct fixed_size_tensor_index_extraction_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(const Index, - const Dimensions&) - { - return 0; - } - }; - -} // end namespace internal - - -// Fixed size -#ifndef EIGEN_EMULATE_CXX11_META_H -template -struct Sizes : internal::numeric_list { - typedef internal::numeric_list Base; - static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { - return Base::count; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { - return internal::arg_prod(Indices...); - } - - EIGEN_DEVICE_FUNC Sizes() { } - template - explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { - // todo: add assertion - } -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } - explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { - // todo: add assertion - } -#endif - - template Sizes& operator = (const T& /*other*/) { - // add assertion failure if the size of other is different - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); - } -}; - -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { - return Sizes::total_size; -} -} - 
-#else - -template -struct non_zero_size { - typedef internal::type2val type; -}; -template <> -struct non_zero_size<0> { - typedef internal::null_type type; -}; - -template struct Sizes { - typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; - static const size_t count = Base::count; - static const std::size_t total_size = internal::arg_prod::value; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { - return count; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { - return internal::arg_prod::value; - } - - Sizes() { } - template - explicit Sizes(const array& /*indices*/) { - // todo: add assertion - } - template Sizes& operator = (const T& /*other*/) { - // add assertion failure if the size of other is different - return *this; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex... /*indices*/) { } - explicit Sizes(std::initializer_list) { - // todo: add assertion - } -#else - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex operator[] (const int index) const { - switch (index) { - case 0: - return internal::get<0, Base>::value; - case 1: - return internal::get<1, Base>::value; - case 2: - return internal::get<2, Base>::value; - case 3: - return internal::get<3, Base>::value; - case 4: - return internal::get<4, Base>::value; - default: - eigen_assert(false && "index overflow"); - return static_cast(-1); - } - } - - template EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); - } -}; - -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { - return Sizes::total_size; -} -} - -#endif - -// Boilerplate -namespace internal { -template -struct tensor_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, array const& dimensions) - { - return array_get(indices) + - array_get(dimensions) * - tensor_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct tensor_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, array const&) - { - return array_get(indices); - } -}; -} // end namespace internal - - - -// Dynamic size -template -struct DSizes : array { - typedef array Base; - static const int count = NumDims; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { - return NumDims; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { - return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = 0; - } - } - EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } - - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { - eigen_assert(NumDims == 1); - (*this)[0] = i0; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { - eigen_assert(NumDims == 2); - (*this)[0] = i0; - (*this)[1] = i1; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { - eigen_assert(NumDims == 3); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { - eigen_assert(NumDims == 4); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { - eigen_assert(NumDims == 5); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - (*this)[4] = i4; - } -#endif - - EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { - *static_cast(this) = other; - return *this; - } - - // A constexpr would be so much better here - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { - return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { - return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); - } -}; - - - - -// Boilerplate -namespace internal { -template -struct tensor_vsize_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, std::vector const& dimensions) - { - return array_get(indices) + - array_get(dimensions) * - tensor_vsize_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct tensor_vsize_index_linearization_helper -{ - static 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, std::vector const&) - { - return array_get(indices); - } -}; -} // end namespace internal - - -namespace internal { - -template struct array_size > { - static const size_t value = NumDims; -}; -template struct array_size > { - static const size_t value = NumDims; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template struct array_size > { -static const std::ptrdiff_t value = Sizes::count; -}; -template struct array_size > { -static const std::ptrdiff_t value = Sizes::count; -}; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { - return get >::value; -} -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { - eigen_assert(false && "should never be called"); - return -1; -} -#else -template struct array_size > { - static const size_t value = Sizes::count; -}; -template struct array_size > { - static const size_t value = Sizes::count; -}; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { - return get::Base>::value; -} - -#endif - - -template -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { - return false; - } -}; -template -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { - return (array_get(dims1) == array_get(dims2)) & - sizes_match_below_dim::run(dims1, dims2); - } -}; -template -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { - return true; - } -}; - -} // end namespace internal - - -template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { - return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h 
b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h deleted file mode 100644 index 06987132..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ /dev/null @@ -1,181 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H - -namespace Eigen { - -/** \class TensorForcedEval - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -namespace internal { -template class MakePointer_> -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; - template - struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorEvalToOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorEvalToOp type; -}; - -} // end namespace internal - - - - -template class MakePointer_> -class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename MakePointer_::Type PointerType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) - : m_xpr(expr), m_buffer(buffer) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } - - protected: - typename XprType::Nested m_xpr; - PointerType m_buffer; -}; - - - -template class MakePointer_> -struct TensorEvaluator, Device> -{ - typedef TensorEvalToOp XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : 
m_impl(op.expression(), device), m_device(device), - m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) - { } - - // Used for accessor extraction in SYCL Managed TensorMap: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { - return m_op; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { - } - - typedef typename internal::traits >::template MakePointer::Type DevicePointer; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { - EIGEN_UNUSED_VARIABLE(scalar); - eigen_assert(scalar == NULL); - return m_impl.evalSubExprsIfNeeded(m_buffer); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { - m_buffer[i] = m_impl.coeff(i); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // We assume that evalPacket or evalScalar is called to perform the - // assignment and account for the cost of the write here. 
- return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } - ArgType expression() const { return m_expression; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - - private: - TensorEvaluator m_impl; - const Device& m_device; - DevicePointer m_buffer; - const XprType& m_op; - const ArgType m_expression; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h deleted file mode 100644 index 834ce07d..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ /dev/null @@ -1,633 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor evaluator classes. - * - * These classes are responsible for the evaluation of the tensor expression. - * - * TODO: add support for more types of expressions, in particular expressions - * leading to lvalues (slicing, reshaping, etc...) 
- */ - -// Generic evaluator -template -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits::NumDimensions > 0 ? - internal::traits::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) - { } - - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); - return false; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); - return m_data[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - eigen_assert(m_data); - return m_data[index]; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt(m_data + index); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - return internal::pstoret(m_data + index, x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType 
coeff(const array& coords) const { - eigen_assert(m_data); - if (static_cast(Layout) == static_cast(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { - eigen_assert(m_data); - if (static_cast(Layout) == static_cast(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} - - protected: - typename internal::traits::template MakePointer::Type m_data; - Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; -}; - -namespace { -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T loadConstant(const T* address) { - return *address; -} -// Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float loadConstant(const float* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double loadConstant(const double* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -Eigen::half loadConstant(const Eigen::half* address) { - return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); -} -#endif -} - - -// Default evaluator for rvalues -template -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename 
PacketType::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits::NumDimensions > 0 ? - internal::traits::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); - return false; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); - return loadConstant(m_data+index); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt_ro(m_data + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); - const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) - : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - - protected: - typename internal::traits::template MakePointer::Type m_data; - Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; -}; - - - - -// -------------------- CwiseNullaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseNullaryOp XprType; - - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_wrapper(m_functor, index); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) 
const - { - return m_wrapper.template packetOp(m_functor, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } - - - private: - const NullaryOp m_functor; - TensorEvaluator m_argImpl; - const internal::nullary_wrapper m_wrapper; -}; - - - -// -------------------- CwiseUnaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseUnaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_argImpl(op.nestedExpression(), device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_argImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_argImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_argImpl.coeff(index)); - } - - 
template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_argImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_argImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } - - - private: - const UnaryOp m_functor; - TensorEvaluator m_argImpl; -}; - - -// -------------------- CwiseBinaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseBinaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - 
typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use right impl instead if right impl dimensions are known at compile time. - return m_leftImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_leftImpl.costPerCoeff(vectorized) + - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } - - private: - const BinaryOp m_functor; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; -}; - -// -------------------- CwiseTernaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseTernaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess 
= TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_arg1Impl(op.arg1Expression(), device), - m_arg2Impl(op.arg2Expression(), device), - m_arg3Impl(op.arg3Expression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - - EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::Index, - typename internal::traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::Index, - typename internal::traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - - eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use arg2 or arg3 dimensions if they are known at compile time. 
- return m_arg1Impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_arg1Impl.evalSubExprsIfNeeded(NULL); - m_arg2Impl.evalSubExprsIfNeeded(NULL); - m_arg3Impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_arg1Impl.cleanup(); - m_arg2Impl.cleanup(); - m_arg3Impl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_arg1Impl.template packet(index), - m_arg2Impl.template packet(index), - m_arg3Impl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_arg1Impl.costPerCoeff(vectorized) + - m_arg2Impl.costPerCoeff(vectorized) + - m_arg3Impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } - - private: - const TernaryOp m_functor; - TensorEvaluator m_arg1Impl; - TensorEvaluator m_arg2Impl; - TensorEvaluator m_arg3Impl; -}; - - -// -------------------- SelectOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorSelectOp XprType; - typedef typename XprType::Scalar Scalar; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - 
PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::packet_traits::HasBlend, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_condImpl(op.ifExpression(), device), - m_thenImpl(op.thenExpression(), device), - m_elseImpl(op.elseExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); - eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use then or else impl instead if they happen to be known at compile time. - return m_condImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_condImpl.evalSubExprsIfNeeded(NULL); - m_thenImpl.evalSubExprsIfNeeded(NULL); - m_elseImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_condImpl.cleanup(); - m_thenImpl.cleanup(); - m_elseImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_condImpl.coeff(index) ? 
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); - } - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - internal::Selector select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_condImpl.costPerCoeff(vectorized) + - m_thenImpl.costPerCoeff(vectorized) - .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& else_impl() const { return m_elseImpl; } - - private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h deleted file mode 100644 index f01d77c0..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ /dev/null @@ -1,288 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H - -namespace Eigen { - -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ -namespace internal { - -// Default strategy: the expression is evaluated with a single cpu thread. -template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - - -template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not - // unroll the loop at the expense of inlining. 
- const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { - evaluator.evalPacket(i); - } - for (Index i = VectorizedSize; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - - - -// Multicore strategy: the index space is partitioned and each partition is executed on a single core -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { - Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { - evaluator.evalScalar(i); - } - } - - static Index alignBlockSize(Index size) { - return size; - } -}; - -template -struct EvalRange { - static const int PacketSize = unpacket_traits::size; - - static void run(Evaluator* evaluator_in, const Index first, const Index last) { - Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - Index i = first; - if (last - first >= PacketSize) { - eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not - // unroll the loop at the expense of inlining. 
- for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - last_chunk_offset = last - PacketSize; - for (; i <= last_chunk_offset; i += PacketSize) { - evaluator.evalPacket(i); - } - } - for (; i < last; ++i) { - evaluator.evalScalar(i); - } - } - - static Index alignBlockSize(Index size) { - // Align block size to packet size and account for unrolling in run above. - if (size >= 16 * PacketSize) { - return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); - } - // Aligning to 4 * PacketSize would increase block size by more than 25%. - return (size + PacketSize - 1) & ~(PacketSize - 1); - } -}; - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { - typedef TensorEvaluator Evaluator; - Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); -#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) - device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { - EvalRange::run(&evaluator, first, last); - }); -#else - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? 
unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } -#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) - } - evaluator.cleanup(); - } -}; -#endif // EIGEN_USE_THREADS - - -// GPU: the evaluation of the expression is offloaded to a GPU. -#if defined(EIGEN_USE_GPU) - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); -}; - - -#if defined(__CUDACC__) -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; - - // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first; i < last; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template -__global__ void -__launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { - - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; - - const 
bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); -} - -/*static*/ -template -inline void TensorExecutor::run( - const Expression& expr, const GpuDevice& device) { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / block_size; - const Index size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - - LAUNCH_CUDA_KERNEL( - (EigenMetaKernel, Index>), - num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - -#endif // __CUDACC__ -#endif // EIGEN_USE_GPU - -// SYCL Executor policy -#ifdef EIGEN_USE_SYCL - -template -class TensorExecutor { -public: - static inline void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); - } -}; - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h deleted file mode 100644 index 85dfc7a6..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ /dev/null @@ -1,371 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H - -namespace Eigen { - -/** \class TensorExpr - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor expression classes. - * - * The TensorCwiseNullaryOp class applies a nullary operators to an expression. - * This is typically used to generate constants. - * - * The TensorCwiseUnaryOp class represents an expression where a unary operator - * (e.g. cwiseSqrt) is applied to an expression. - * - * The TensorCwiseBinaryOp class represents an expression where a binary - * operator (e.g. addition) is applied to a lhs and a rhs expression. - * - */ -namespace internal { -template -struct traits > - : traits -{ - typedef traits XprTraits; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference::type _XprTypeNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; -}; - -} // end namespace internal - - - -template -class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef TensorCwiseNullaryOp Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) - : m_xpr(xpr), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - const NullaryOp& functor() const { return m_functor; } - - protected: - typename XprType::Nested m_xpr; - const NullaryOp m_functor; -}; - - - -namespace internal { -template -struct traits > - : traits -{ - // TODO(phli): 
Add InputScalar, InputPacket. Check references to - // current Scalar/Packet to see if the intent is Input or Output. - typedef typename result_of::type Scalar; - typedef traits XprTraits; - typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference::type _XprTypeNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCwiseUnaryOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorCwiseUnaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - // TODO(phli): Add InputScalar, InputPacket. Check references to - // current Scalar/Packet to see if the intent is Input or Output. - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) - : m_xpr(xpr), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const UnaryOp& functor() const { return m_functor; } - - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const UnaryOp m_functor; -}; - - -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs - // are different. - // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to - // current Scalar/Packet to see if the intent is Inputs or Output. 
- typedef typename result_of< - BinaryOp(typename LhsXprType::Scalar, - typename RhsXprType::Scalar)>::type Scalar; - typedef traits XprTraits; - typedef typename promote_storage_type< - typename traits::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type< - typename traits::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCwiseBinaryOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorCwiseBinaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to - // current Scalar/Packet to see if the intent is Inputs or Output. 
- typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const BinaryOp& functor() const { return m_functor; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const BinaryOp m_functor; -}; - - -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the args are different. 
- typedef typename result_of< - TernaryOp(typename Arg1XprType::Scalar, - typename Arg2XprType::Scalar, - typename Arg3XprType::Scalar)>::type Scalar; - typedef traits XprTraits; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename Arg1XprType::Nested Arg1Nested; - typedef typename Arg2XprType::Nested Arg2Nested; - typedef typename Arg3XprType::Nested Arg3Nested; - typedef typename remove_reference::type _Arg1Nested; - typedef typename remove_reference::type _Arg2Nested; - typedef typename remove_reference::type _Arg3Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCwiseTernaryOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorCwiseTernaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCwiseTernaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) - : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const TernaryOp& functor() const { return m_functor; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - arg1Expression() const { return m_arg1_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - arg2Expression() const { return m_arg2_xpr; } - - EIGEN_DEVICE_FUNC 
- const typename internal::remove_all::type& - arg3Expression() const { return m_arg3_xpr; } - - protected: - typename Arg1XprType::Nested m_arg1_xpr; - typename Arg2XprType::Nested m_arg2_xpr; - typename Arg3XprType::Nested m_arg3_xpr; - const TernaryOp m_functor; -}; - - -namespace internal { -template -struct traits > - : traits -{ - typedef typename traits::Scalar Scalar; - typedef traits XprTraits; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename IfXprType::Nested IfNested; - typedef typename ThenXprType::Nested ThenNested; - typedef typename ElseXprType::Nested ElseNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorSelectOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorSelectOp type; -}; - -} // end namespace internal - - -template -class TensorSelectOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC - TensorSelectOp(const IfXprType& a_condition, - const ThenXprType& a_then, - const ElseXprType& a_else) - : m_condition(a_condition), m_then(a_then), m_else(a_else) - { } - - EIGEN_DEVICE_FUNC - const IfXprType& ifExpression() const { return m_condition; } - - EIGEN_DEVICE_FUNC - const ThenXprType& thenExpression() const { return m_then; } - - EIGEN_DEVICE_FUNC - const ElseXprType& elseExpression() const { return m_else; } - - protected: - typename 
IfXprType::Nested m_condition; - typename ThenXprType::Nested m_then; - typename ElseXprType::Nested m_else; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h deleted file mode 100644 index 08eb5595..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ /dev/null @@ -1,651 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Jianwei Cui -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H -#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H - -// This code requires the ability to initialize arrays of constant -// values directly inside a class. -#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 - -namespace Eigen { - -/** \class TensorFFT - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor FFT class. 
- * - * TODO: - * Vectorize the Cooley Tukey and the Bluestein algorithm - * Add support for multithreaded evaluation - * Improve the performance on GPU - */ - -template struct MakeComplex { - template - EIGEN_DEVICE_FUNC - T operator() (const T& val) const { return val; } -}; - -template <> struct MakeComplex { - template - EIGEN_DEVICE_FUNC - std::complex operator() (const T& val) const { return std::complex(val, 0); } -}; - -template <> struct MakeComplex { - template - EIGEN_DEVICE_FUNC - std::complex operator() (const std::complex& val) const { return val; } -}; - -template struct PartOf { - template T operator() (const T& val) const { return val; } -}; - -template <> struct PartOf { - template T operator() (const std::complex& val) const { return val.real(); } -}; - -template <> struct PartOf { - template T operator() (const std::complex& val) const { return val.imag(); } -}; - -namespace internal { -template -struct traits > : public traits { - typedef traits XprTraits; - typedef typename NumTraits::Real RealScalar; - typedef typename std::complex ComplexScalar; - typedef typename XprTraits::Scalar InputScalar; - typedef typename conditional::type OutputScalar; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> { - typedef const TensorFFTOp& type; -}; - -template -struct nested, 1, typename eval >::type> { - typedef TensorFFTOp type; -}; - -} // end namespace internal - -template -class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename std::complex ComplexScalar; - typedef typename internal::conditional::type OutputScalar; - 
typedef OutputScalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) - : m_xpr(expr), m_fft(fft) {} - - EIGEN_DEVICE_FUNC - const FFT& fft() const { return m_fft; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& expression() const { - return m_xpr; - } - - protected: - typename XprType::Nested m_xpr; - const FFT m_fft; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> { - typedef TensorFFTOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename std::complex ComplexScalar; - typedef typename TensorEvaluator::Dimensions InputDimensions; - typedef internal::traits XprTraits; - typedef typename XprTraits::Scalar InputScalar; - typedef typename internal::conditional::type OutputScalar; - typedef OutputScalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = true, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - 
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - } - } - m_size = m_dimensions.TotalSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { - m_impl.evalSubExprsIfNeeded(NULL); - if (data) { - evalToBuf(data); - return false; - } else { - m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); - evalToBuf(m_data); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_data) { - m_device.deallocate(m_data); - m_data = NULL; - } - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { - return m_data[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType - packet(Index index) const { - return internal::ploadt(m_data + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } - - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { - const bool write_to_out = internal::is_same::value; - ComplexScalar* buf = write_to_out ? 
(ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); - - for (Index i = 0; i < m_size; ++i) { - buf[i] = MakeComplex::value>()(m_impl.coeff(i)); - } - - for (size_t i = 0; i < m_fft.size(); ++i) { - Index dim = m_fft[i]; - eigen_assert(dim >= 0 && dim < NumDims); - Index line_len = m_dimensions[dim]; - eigen_assert(line_len >= 1); - ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); - const bool is_power_of_two = isPowerOfTwo(line_len); - const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); - const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); - - ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); - ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); - ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); - if (!is_power_of_two) { - // Compute twiddle factors - // t_n = exp(sqrt(-1) * pi * n^2 / line_len) - // for n = 0, 1,..., line_len-1. 
- // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const RealScalar pi_over_len(EIGEN_PI / line_len); - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int j = 2; j < line_len + 1; ++j) { - pos_j_base_powered[j] = pos_j_base_powered[j - 1] * - pos_j_base_powered[j - 1] / - pos_j_base_powered[j - 2] * pos_j_base_sq; - } - } - } - } - - for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { - const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); - - // get data into line_buf - const Index stride = m_strides[dim]; - if (stride == 1) { - memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); - } else { - Index offset = base_offset; - for (int j = 0; j < line_len; ++j, offset += stride) { - line_buf[j] = buf[offset]; - } - } - - // processs the line - if (is_power_of_two) { - processDataLineCooleyTukey(line_buf, line_len, log_len); - } - else { - processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); - } - - // write back - if (FFTDir == FFT_FORWARD && stride == 1) { - memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); - } else { - Index offset = base_offset; - const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); - for (int j = 0; j < line_len; ++j, offset += stride) { - buf[offset] = (FFTDir == FFT_FORWARD) ? 
line_buf[j] : line_buf[j] * div_factor; - } - } - } - m_device.deallocate(line_buf); - if (!is_power_of_two) { - m_device.deallocate(a); - m_device.deallocate(b); - m_device.deallocate(pos_j_base_powered); - } - } - - if(!write_to_out) { - for (Index i = 0; i < m_size; ++i) { - data[i] = PartOf()(buf[i]); - } - m_device.deallocate(buf); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { - eigen_assert(x > 0); - return !(x & (x - 1)); - } - - // The composite number for padding, used in Bluestein's FFT algorithm - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { - Index i = 2; - while (i < 2 * n - 1) i *= 2; - return i; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { - Index log2m = 0; - while (m >>= 1) log2m++; - return log2m; - } - - // Call Cooley Tukey algorithm directly, data length must be power of 2 - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { - eigen_assert(isPowerOfTwo(line_len)); - scramble_FFT(line_buf, line_len); - compute_1D_Butterfly(line_buf, line_len, log_len); - } - - // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { - Index n = line_len; - Index m = good_composite; - ComplexScalar* data = line_buf; - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - a[i] = data[i] * numext::conj(pos_j_base_powered[i]); - } - else { - a[i] = data[i] * pos_j_base_powered[i]; - } - } - for (Index i = n; i < m; ++i) { - a[i] = ComplexScalar(0, 0); - } - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - b[i] = pos_j_base_powered[i]; - } - else { - b[i] = 
numext::conj(pos_j_base_powered[i]); - } - } - for (Index i = n; i < m - n; ++i) { - b[i] = ComplexScalar(0, 0); - } - for (Index i = m - n; i < m; ++i) { - if(FFTDir == FFT_FORWARD) { - b[i] = pos_j_base_powered[m-i]; - } - else { - b[i] = numext::conj(pos_j_base_powered[m-i]); - } - } - - scramble_FFT(a, m); - compute_1D_Butterfly(a, m, log_len); - - scramble_FFT(b, m); - compute_1D_Butterfly(b, m, log_len); - - for (Index i = 0; i < m; ++i) { - a[i] *= b[i]; - } - - scramble_FFT(a, m); - compute_1D_Butterfly(a, m, log_len); - - //Do the scaling after ifft - for (Index i = 0; i < m; ++i) { - a[i] /= m; - } - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - data[i] = a[i] * numext::conj(pos_j_base_powered[i]); - } - else { - data[i] = a[i] * pos_j_base_powered[i]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { - eigen_assert(isPowerOfTwo(n)); - Index j = 1; - for (Index i = 1; i < n; ++i){ - if (j > i) { - std::swap(data[j-1], data[i-1]); - } - Index m = n >> 1; - while (m >= 2 && j > m) { - j -= m; - m >>= 1; - } - j += m; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { - ComplexScalar tmp = data[1]; - data[1] = data[0] - data[1]; - data[0] += tmp; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { - ComplexScalar tmp[4]; - tmp[0] = data[0] + data[1]; - tmp[1] = data[0] - data[1]; - tmp[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); - } else { - tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); - } - data[0] = tmp[0] + tmp[2]; - data[1] = tmp[1] + tmp[3]; - data[2] = tmp[0] - tmp[2]; - data[3] = tmp[1] - tmp[3]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { - ComplexScalar tmp_1[8]; - ComplexScalar tmp_2[8]; - - tmp_1[0] = data[0] + data[1]; - tmp_1[1] = data[0] - 
data[1]; - tmp_1[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); - } else { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); - } - tmp_1[4] = data[4] + data[5]; - tmp_1[5] = data[4] - data[5]; - tmp_1[6] = data[6] + data[7]; - if (Dir == FFT_FORWARD) { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); - } else { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); - } - tmp_2[0] = tmp_1[0] + tmp_1[2]; - tmp_2[1] = tmp_1[1] + tmp_1[3]; - tmp_2[2] = tmp_1[0] - tmp_1[2]; - tmp_2[3] = tmp_1[1] - tmp_1[3]; - tmp_2[4] = tmp_1[4] + tmp_1[6]; -// SQRT2DIV2 = sqrt(2)/2 -#define SQRT2DIV2 0.7071067811865476 - if (Dir == FFT_FORWARD) { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); - } else { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); - } - data[0] = tmp_2[0] + tmp_2[4]; - data[1] = tmp_2[1] + tmp_2[5]; - data[2] = tmp_2[2] + tmp_2[6]; - data[3] = tmp_2[3] + tmp_2[7]; - data[4] = tmp_2[0] - tmp_2[4]; - data[5] = tmp_2[1] - tmp_2[5]; - data[6] = tmp_2[2] - tmp_2[6]; - data[7] = tmp_2[3] - tmp_2[7]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( - ComplexScalar* data, Index n, Index n_power_of_2) { - // Original code: - // RealScalar wtemp = std::sin(M_PI/n); - // RealScalar wpi = -std::sin(2 * M_PI/n); - const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; - const RealScalar wpi = (Dir == FFT_FORWARD) - ? 
m_minus_sin_2_PI_div_n_LUT[n_power_of_2] - : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; - - const ComplexScalar wp(wtemp, wpi); - const ComplexScalar wp_one = wp + ComplexScalar(1, 0); - const ComplexScalar wp_one_2 = wp_one * wp_one; - const ComplexScalar wp_one_3 = wp_one_2 * wp_one; - const ComplexScalar wp_one_4 = wp_one_3 * wp_one; - const Index n2 = n / 2; - ComplexScalar w(1.0, 0.0); - for (Index i = 0; i < n2; i += 4) { - ComplexScalar temp0(data[i + n2] * w); - ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); - ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); - ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); - w = w * wp_one_4; - - data[i + n2] = data[i] - temp0; - data[i] += temp0; - - data[i + 1 + n2] = data[i + 1] - temp1; - data[i + 1] += temp1; - - data[i + 2 + n2] = data[i + 2] - temp2; - data[i + 2] += temp2; - - data[i + 3 + n2] = data[i + 3] - temp3; - data[i + 3] += temp3; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( - ComplexScalar* data, Index n, Index n_power_of_2) { - eigen_assert(isPowerOfTwo(n)); - if (n > 8) { - compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); - compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); - butterfly_1D_merge(data, n, n_power_of_2); - } else if (n == 8) { - butterfly_8(data); - } else if (n == 4) { - butterfly_4(data); - } else if (n == 2) { - butterfly_2(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { - Index result = 0; - - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > omitted_dim; --i) { - const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; - const Index idx = index / partial_m_stride; - index -= idx * partial_m_stride; - result += idx * m_strides[i]; - } - result += index; - } - else { - for (Index i = 0; i < omitted_dim; ++i) { - const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; - 
const Index idx = index / partial_m_stride; - index -= idx * partial_m_stride; - result += idx * m_strides[i]; - } - result += index; - } - // Value of index_coords[omitted_dim] is not determined to this step - return result; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { - Index result = base + offset * m_strides[omitted_dim] ; - return result; - } - - protected: - Index m_size; - const FFT& m_fft; - Dimensions m_dimensions; - array m_strides; - TensorEvaluator m_impl; - CoeffReturnType* m_data; - const Device& m_device; - - // This will support a maximum FFT size of 2^32 for each dimension - // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; - const RealScalar m_sin_PI_div_n_LUT[32] = { - RealScalar(0.0), - RealScalar(-2), - RealScalar(-0.999999999999999), - RealScalar(-0.292893218813453), - RealScalar(-0.0761204674887130), - RealScalar(-0.0192147195967696), - RealScalar(-0.00481527332780311), - RealScalar(-0.00120454379482761), - RealScalar(-3.01181303795779e-04), - RealScalar(-7.52981608554592e-05), - RealScalar(-1.88247173988574e-05), - RealScalar(-4.70619042382852e-06), - RealScalar(-1.17654829809007e-06), - RealScalar(-2.94137117780840e-07), - RealScalar(-7.35342821488550e-08), - RealScalar(-1.83835707061916e-08), - RealScalar(-4.59589268710903e-09), - RealScalar(-1.14897317243732e-09), - RealScalar(-2.87243293150586e-10), - RealScalar( -7.18108232902250e-11), - RealScalar(-1.79527058227174e-11), - RealScalar(-4.48817645568941e-12), - RealScalar(-1.12204411392298e-12), - RealScalar(-2.80511028480785e-13), - RealScalar(-7.01277571201985e-14), - RealScalar(-1.75319392800498e-14), - RealScalar(-4.38298482001247e-15), - RealScalar(-1.09574620500312e-15), - RealScalar(-2.73936551250781e-16), - RealScalar(-6.84841378126949e-17), - RealScalar(-1.71210344531737e-17), - RealScalar(-4.28025861329343e-18) - }; - - // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / 
std::pow(2,i)); - const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { - RealScalar(0.0), - RealScalar(0.0), - RealScalar(-1.00000000000000e+00), - RealScalar(-7.07106781186547e-01), - RealScalar(-3.82683432365090e-01), - RealScalar(-1.95090322016128e-01), - RealScalar(-9.80171403295606e-02), - RealScalar(-4.90676743274180e-02), - RealScalar(-2.45412285229123e-02), - RealScalar(-1.22715382857199e-02), - RealScalar(-6.13588464915448e-03), - RealScalar(-3.06795676296598e-03), - RealScalar(-1.53398018628477e-03), - RealScalar(-7.66990318742704e-04), - RealScalar(-3.83495187571396e-04), - RealScalar(-1.91747597310703e-04), - RealScalar(-9.58737990959773e-05), - RealScalar(-4.79368996030669e-05), - RealScalar(-2.39684498084182e-05), - RealScalar(-1.19842249050697e-05), - RealScalar(-5.99211245264243e-06), - RealScalar(-2.99605622633466e-06), - RealScalar(-1.49802811316901e-06), - RealScalar(-7.49014056584716e-07), - RealScalar(-3.74507028292384e-07), - RealScalar(-1.87253514146195e-07), - RealScalar(-9.36267570730981e-08), - RealScalar(-4.68133785365491e-08), - RealScalar(-2.34066892682746e-08), - RealScalar(-1.17033446341373e-08), - RealScalar(-5.85167231706864e-09), - RealScalar(-2.92583615853432e-09) - }; -}; - -} // end namespace Eigen - -#endif // EIGEN_HAS_CONSTEXPR - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h deleted file mode 100644 index fcee5f60..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ /dev/null @@ -1,389 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H -#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H - -namespace Eigen { - -/** \class TensorFixedSize - * \ingroup CXX11_Tensor_Module - * - * \brief The fixed sized version of the tensor class. - * - * The fixed sized equivalent of - * Eigen::Tensor t(3, 5, 7); - * is - * Eigen::TensorFixedSize> t; - */ - -template -class TensorFixedSize : public TensorBase > -{ - public: - typedef TensorFixedSize Self; - typedef TensorBase > Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef Scalar_ Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - static const int Options = Options_; - - enum { - IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), - Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true, - RawAccess = true - }; - - typedef Dimensions_ Dimensions; - static const std::size_t NumIndices = Dimensions::count; - - protected: - TensorStorage m_storage; - - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } - - // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - // work, because that uses base().coeffRef() - and we don't yet - // implement a similar class hierarchy - inline Self& base() { return *this; } - inline const Self& base() const { return *this; } - -#if 
EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array{{firstIndex, otherIndices...}}); - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array{{firstIndex, otherIndices...}}); - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array{{firstIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - if (Options&RowMajor) { - const Index index = i1 + i0 * m_storage.dimensions()[1]; - return m_storage.data()[index]; - } else { - const Index index = i0 + i1 * m_storage.dimensions()[0]; - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - if (Options&RowMajor) { - const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const 
- { - if (Options&RowMajor) { - const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - if (Options&RowMajor) { - const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); - return m_storage.data()[index]; - } - } -#endif - - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const - { - eigen_assert(checkIndexRange(indices)); - return coeff(indices); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return coeff(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const - { - // The bracket operator is only for vectors, use the parenthesis operator instead. - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array{{firstIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - if (Options&RowMajor) { - const Index index = i1 + i0 * m_storage.dimensions()[1]; - return m_storage.data()[index]; - } else { - const Index index = i0 + i1 * m_storage.dimensions()[0]; - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - if (Options&RowMajor) { - const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - if (Options&RowMajor) { - const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - if (Options&RowMajor) { - const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); - return 
m_storage.data()[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) - { - eigen_assert(checkIndexRange(indices)); - return coeffRef(indices); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < size()); - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeffRef(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator[](Index index) - { - // The bracket operator is only for vectors, use the parenthesis operator instead - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize() - : m_storage() - { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) - : m_storage(other.m_storage) - { - } - -#if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) - : m_storage(other.m_storage) - { - } -#endif - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. 
- typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const - { - using internal::array_apply_and_reduce; - using internal::array_zip_and_reduce; - using internal::greater_equal_zero_op; - using internal::logical_and_op; - using internal::lesser_op; - - return true; - // check whether the indices are all >= 0 - /* array_apply_and_reduce(indices) && - // check whether the indices fit in the dimensions - array_zip_and_reduce(indices, m_storage.dimensions());*/ - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const - { - if (Options&RowMajor) { - return m_storage.dimensions().IndexOfRowMajor(indices); - } else { - return m_storage.dimensions().IndexOfColMajor(indices); - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h deleted file mode 100644 index bbd5eb37..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ /dev/null @@ -1,167 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H - -namespace Eigen { - -/** \class TensorForcedEval - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -/// template class MakePointer_ is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler T* is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T. -/// This is done through our MakePointer_ class. By default the Type in the MakePointer_ is T* . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is T*. -namespace internal { -template class MakePointer_> -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorForcedEvalOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorForcedEvalOp type; -}; - -} // end namespace internal - - - -template class MakePointer_> -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - - -template class MakePointer_> -struct TensorEvaluator, Device> -{ - typedef TensorForcedEvalOp XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = (PacketSize > 1), - Layout = TensorEvaluator::Layout, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) - { } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(CoeffReturnType*) { - const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); - // Should initialize the memory in case we're dealing with non POD types. - if (NumTraits::RequireInitialization) { - for (Index i = 0; i < numValues; ++i) { - new(m_buffer+i) CoeffReturnType(); - } - } - typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate(m_buffer); - m_buffer = NULL; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } - - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} - private: - TensorEvaluator m_impl; - const ArgType m_op; - const Device& m_device; - typename MakePointer::Type m_buffer; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h deleted file mode 100644 index 52b803d7..00000000 --- 
a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ /dev/null @@ -1,109 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H - -namespace Eigen { - -// MakePointer class is used as a container of the adress space of the pointer -// on the host and on the device. From the host side it generates the T* pointer -// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to -// T* m_data on the host. It is always called on the device. -// Specialisation of MakePointer class for creating the sycl buffer with -// map_allocator. -template struct MakePointer { - typedef T* Type; -}; - -template class MakePointer_ = MakePointer> class TensorMap; -template class Tensor; -template class TensorFixedSize; -template class TensorRef; -template class TensorBase; - -template class TensorCwiseNullaryOp; -template class TensorCwiseUnaryOp; -template class TensorCwiseBinaryOp; -template class TensorCwiseTernaryOp; -template class TensorSelectOp; -template class MakePointer_ = MakePointer > class TensorReductionOp; -template class TensorIndexTupleOp; -template class TensorTupleReducerOp; -template class TensorConcatenationOp; -template class TensorContractionOp; -template class TensorConversionOp; -template class TensorConvolutionOp; -template class TensorFFTOp; -template class TensorPatchOp; -template class TensorImagePatchOp; -template class TensorVolumePatchOp; -template class TensorBroadcastingOp; -template class TensorChippingOp; -template class TensorReshapingOp; -template class TensorLayoutSwapOp; -template class 
TensorSlicingOp; -template class TensorReverseOp; -template class TensorPaddingOp; -template class TensorShufflingOp; -template class TensorStridingOp; -template class TensorStridingSlicingOp; -template class TensorInflationOp; -template class TensorGeneratorOp; -template class TensorAssignOp; -template class TensorScanOp; - -template class TensorCustomUnaryOp; -template class TensorCustomBinaryOp; - -template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class MakePointer_ = MakePointer> class TensorForcedEvalOp; - -template class TensorDevice; -template struct TensorEvaluator; - -struct DefaultDevice; -struct ThreadPoolDevice; -struct GpuDevice; -struct SyclDevice; - -enum FFTResultType { - RealPart = 0, - ImagPart = 1, - BothParts = 2 -}; - -enum FFTDirection { - FFT_FORWARD = 0, - FFT_REVERSE = 1 -}; - - -namespace internal { - -template -struct IsVectorizable { - static const bool value = TensorEvaluator::PacketAccess; -}; - -template -struct IsVectorizable { - static const bool value = TensorEvaluator::PacketAccess && - TensorEvaluator::IsAligned; -}; - -template ::value> -class TensorExecutor; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h deleted file mode 100644 index d73f6dc6..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ /dev/null @@ -1,489 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H -#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H - -namespace Eigen { -namespace internal { - - -/** \internal - * \brief Template functor to compute the modulo between an array and a scalar. - */ -template -struct scalar_mod_op { - EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } - const Scalar m_divisor; -}; -template -struct functor_traits > -{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; - - -/** \internal - * \brief Template functor to compute the modulo between 2 arrays. - */ -template -struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } -}; -template -struct functor_traits > -{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; - -template -struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar - operator()(const Scalar& a, const Scalar& b) const { - return numext::fmod(a, b); - } -}; -template -struct functor_traits > { - enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. 
- PacketAccess = false }; -}; - - -/** \internal - * \brief Template functor to compute the sigmoid of a scalar - * \sa class CwiseUnaryOp, ArrayBase::sigmoid() - */ -template -struct scalar_sigmoid_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& x) const { - const Packet one = pset1(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); - } -}; - -template -struct functor_traits > { - enum { - Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, - PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && - packet_traits::HasNegate && packet_traits::HasExp - }; -}; - - -template -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false - }; -}; - -// Standard reduction functors -template struct SumReducer -{ - static const bool PacketAccess = packet_traits::HasAdd; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - internal::scalar_sum_op sum_op; - *accum = sum_op(*accum, t); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = padd(*accum, p); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op conv; - return conv(0); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)); 
- } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd - }; -}; - - -template struct MeanReducer -{ - static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; - static const bool IsStateful = true; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - MeanReducer() : scalarCount_(0), packetCount_(0) { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { - internal::scalar_sum_op sum_op; - *accum = sum_op(*accum, t); - scalarCount_++; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { - (*accum) = padd(*accum, p); - packetCount_++; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op conv; - return conv(0); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / scalarCount_; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return pdiv(vaccum, pset1(packetCount_)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); - } - - protected: - DenseIndex scalarCount_; - DenseIndex packetCount_; -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd - }; -}; - - -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits::lowest(); - } -}; -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return -Eigen::NumTraits::infinity(); - } -}; -template -struct MinMaxBottomValue 
{ - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits::highest(); - } -}; -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits::infinity(); - } -}; - - -template struct MaxReducer -{ - static const bool PacketAccess = packet_traits::HasMax; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t > *accum) { *accum = t; } - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmax(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::maxi(saccum, predux_max(vaccum)); - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMax - }; -}; - - -template struct MinReducer -{ - static const bool PacketAccess = packet_traits::HasMin; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t < *accum) { *accum = t; } - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmin(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet 
initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::mini(saccum, predux_min(vaccum)); - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMin - }; -}; - - -template struct ProdReducer -{ - static const bool PacketAccess = packet_traits::HasMul; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - internal::scalar_product_op prod_op; - (*accum) = prod_op(*accum, t); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmul(*accum, p); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op conv; - return conv(1); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_product_op prod_op; - return prod_op(saccum, predux_mul(vaccum)); - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::MulCost, - PacketAccess = PacketType::HasMul - }; -}; - - -struct AndReducer -{ - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { - 
*accum = *accum && t; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { - return accum; - } -}; - -template -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false - }; -}; - - -struct OrReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { - *accum = *accum || t; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { - return false; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { - return accum; - } -}; - -template -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false - }; -}; - - -// Argmin/Argmax reducers -template struct ArgMaxTupleReducer -{ - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return T(0, NumTraits::lowest()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { - return accum; - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - - -template struct ArgMinTupleReducer -{ - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return T(0, NumTraits::highest()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { - return accum; - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - - -template -class 
GaussianGenerator { - public: - static const bool PacketAccess = false; - - EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, - const array& std_devs) - : m_means(means) - { - for (size_t i = 0; i < NumDims; ++i) { - m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; - } - } - - EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { - T tmp = T(0); - for (size_t i = 0; i < NumDims; ++i) { - T offset = coordinates[i] - m_means[i]; - tmp += offset * offset / m_two_sigmas[i]; - } - return numext::exp(-tmp); - } - - private: - array m_means; - array m_two_sigmas; -}; - -template -struct functor_traits > { - enum { - Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + - functor_traits >::Cost) + - functor_traits >::Cost, - PacketAccess = GaussianGenerator::PacketAccess - }; -}; - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h deleted file mode 100644 index eb1d4934..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ /dev/null @@ -1,185 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H - -namespace Eigen { - -/** \class TensorGenerator - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor generator class. 
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorGeneratorOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorGeneratorOp type; -}; - -} // end namespace internal - - - -template -class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) - : m_xpr(expr), m_generator(generator) {} - - EIGEN_DEVICE_FUNC - const Generator& generator() const { return m_generator; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Generator m_generator; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorGeneratorOp XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator::Dimensions Dimensions; - static const int NumDims = internal::array_size::value; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - enum { - IsAligned = false, - 
PacketAccess = (internal::unpacket_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) - { - TensorEvaluator impl(op.expression(), device); - m_dimensions = impl.dimensions(); - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - array coords; - extract_coordinates(index, coords); - return m_generator(coords); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool) const { - // TODO(rmlarsen): This is just a placeholder. Define interface to make - // generators return their cost. 
- return TensorOpCost(0, 0, TensorOpCost::AddCost() + - TensorOpCost::MulCost()); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void extract_coordinates(Index index, array& coords) const { - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_strides[i]; - index -= idx * m_strides[i]; - coords[i] = idx; - } - coords[0] = index; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_strides[i]; - index -= idx * m_strides[i]; - coords[i] = idx; - } - coords[NumDims-1] = index; - } - } - - Dimensions m_dimensions; - array m_strides; - Generator m_generator; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h deleted file mode 100644 index 665b861c..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +++ /dev/null @@ -1,33 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Eugene Brevdo -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H - -namespace Eigen { - -/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. - * - * This function computes the regularized incomplete beta function (integral). 
- * - */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const - TensorCwiseTernaryOp, - const ADerived, const BDerived, const XDerived> - betainc(const ADerived& a, const BDerived& b, const XDerived& x) { - return TensorCwiseTernaryOp< - internal::scalar_betainc_op, const ADerived, - const BDerived, const XDerived>( - a, b, x, internal::scalar_betainc_op()); -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h deleted file mode 100644 index a901c5dd..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ /dev/null @@ -1,79 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H -#define EIGEN_CXX11_TENSOR_TENSOR_IO_H - -namespace Eigen { - -namespace internal { - -// Print the tensor as a 2d matrix -template -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - typedef typename internal::remove_const::type Scalar; - typedef typename Tensor::Index Index; - const Index total_size = internal::array_prod(tensor.dimensions()); - if (total_size > 0) { - const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); - static const int layout = Tensor::Layout; - Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); - os << matrix; - } - } -}; - - -// Print the tensor as a vector -template -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - typedef typename internal::remove_const::type Scalar; - typedef typename Tensor::Index Index; - const Index total_size = internal::array_prod(tensor.dimensions()); - if (total_size > 0) { - Map > array(const_cast(tensor.data()), total_size); - os << array; - } - } -}; - - -// Print the tensor as a scalar -template -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - os << tensor.coeff(0); - } -}; -} - -template -std::ostream& operator << (std::ostream& os, const TensorBase& expr) { - typedef TensorEvaluator, DefaultDevice> Evaluator; - typedef typename Evaluator::Dimensions Dimensions; - - // Evaluate the expression if needed - TensorForcedEvalOp eval = expr.eval(); - Evaluator tensor(eval, DefaultDevice()); - tensor.evalSubExprsIfNeeded(NULL); - - // Print the result - static const int rank = internal::array_size::value; - internal::TensorPrinter::run(os, tensor); - - // Cleanup. 
- tensor.cleanup(); - return os; -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h deleted file mode 100644 index 566856ed..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ /dev/null @@ -1,509 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H - -namespace Eigen { - -/** \class TensorImagePatch - * \ingroup CXX11_Tensor_Module - * - * \brief Patch extraction specialized for image processing. - * This assumes that the input has a least 3 dimensions ordered as follow: - * 1st dimension: channels (of size d) - * 2nd dimension: rows (of size r) - * 3rd dimension: columns (of size c) - * There can be additional dimensions such as time (for video) or batch (for - * bulk processing after the first 3. - * Calling the image patch code with patch_rows and patch_cols is equivalent - * to calling the regular patch extraction code with parameters d, patch_rows, - * patch_cols, and 1 for all the additional dimensions. 
- */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename internal::remove_const::type Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorImagePatchOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorImagePatchOp type; -}; - -} // end namespace internal - -template -class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - PaddingType padding_type, Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), - m_padding_type(padding_type), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - DenseIndex padding_top, DenseIndex padding_bottom, - DenseIndex padding_left, DenseIndex padding_right, - Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - DenseIndex patch_rows() const { return m_patch_rows; } - EIGEN_DEVICE_FUNC - DenseIndex patch_cols() const { return m_patch_cols; } - EIGEN_DEVICE_FUNC - DenseIndex row_strides() const { return m_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_strides() const { return m_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_row_strides() const { return m_in_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_col_strides() const { return m_in_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC - bool padding_explicit() const { return m_padding_explicit; } - EIGEN_DEVICE_FUNC - DenseIndex padding_top() const { return m_padding_top; } - EIGEN_DEVICE_FUNC - DenseIndex padding_bottom() const { return m_padding_bottom; } - EIGEN_DEVICE_FUNC - DenseIndex padding_left() const { return m_padding_left; } - EIGEN_DEVICE_FUNC - DenseIndex padding_right() const { return m_padding_right; } - 
EIGEN_DEVICE_FUNC - PaddingType padding_type() const { return m_padding_type; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const DenseIndex m_patch_rows; - const DenseIndex m_patch_cols; - const DenseIndex m_row_strides; - const DenseIndex m_col_strides; - const DenseIndex m_in_row_strides; - const DenseIndex m_in_col_strides; - const DenseIndex m_row_inflate_strides; - const DenseIndex m_col_inflate_strides; - const bool m_padding_explicit; - const DenseIndex m_padding_top; - const DenseIndex m_padding_bottom; - const DenseIndex m_padding_left; - const DenseIndex m_padding_right; - const PaddingType m_padding_type; - const Scalar m_padding_value; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorImagePatchOp XprType; - typedef typename XprType::Index Index; - static const int NumInputDims = internal::array_size::Dimensions>::value; - static const int NumDims = NumInputDims + 1; - typedef DSizes Dimensions; - typedef typename internal::remove_const::type Scalar; - typedef TensorEvaluator, - Device> Self; - typedef TensorEvaluator Impl; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); - - m_paddingValue = op.padding_value(); - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - - // Caches a few variables. 
- if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputDepth = input_dims[0]; - m_inputRows = input_dims[1]; - m_inputCols = input_dims[2]; - } else { - m_inputDepth = input_dims[NumInputDims-1]; - m_inputRows = input_dims[NumInputDims-2]; - m_inputCols = input_dims[NumInputDims-3]; - } - - m_row_strides = op.row_strides(); - m_col_strides = op.col_strides(); - - // Input strides and effective input/patch size - m_in_row_strides = op.in_row_strides(); - m_in_col_strides = op.in_col_strides(); - m_row_inflate_strides = op.row_inflate_strides(); - m_col_inflate_strides = op.col_inflate_strides(); - // The "effective" input rows and input cols are the input rows and cols - // after inflating them with zeros. - // For examples, a 2x3 matrix with row_inflate_strides and - // col_inflate_strides of 2 comes from: - // A B C - // D E F - // - // to a matrix is 3 x 5: - // - // A . B . C - // . . . . . - // D . E . F - - m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; - m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; - m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); - m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); - - if (op.padding_explicit()) { - m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); - m_rowPaddingTop = op.padding_top(); - m_colPaddingLeft = op.padding_left(); - } else { - // Computing padding from the type - switch (op.padding_type()) { - case PADDING_VALID: - m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); - // Calculate the padding - 
m_rowPaddingTop = numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); - m_colPaddingLeft = numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); - break; - case PADDING_SAME: - m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); - m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); - // Calculate the padding - m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; - m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; - break; - default: - eigen_assert(false && "unexpected padding"); - } - } - eigen_assert(m_outputRows > 0); - eigen_assert(m_outputCols > 0); - - // Dimensions for result of extraction. - if (static_cast(Layout) == static_cast(ColMajor)) { - // ColMajor - // 0: depth - // 1: patch_rows - // 2: patch_cols - // 3: number of patches - // 4 and beyond: anything else (such as batch). - m_dimensions[0] = input_dims[0]; - m_dimensions[1] = op.patch_rows(); - m_dimensions[2] = op.patch_cols(); - m_dimensions[3] = m_outputRows * m_outputCols; - for (int i = 4; i < NumDims; ++i) { - m_dimensions[i] = input_dims[i-1]; - } - } else { - // RowMajor - // NumDims-1: depth - // NumDims-2: patch_rows - // NumDims-3: patch_cols - // NumDims-4: number of patches - // NumDims-5 and beyond: anything else (such as batch). - m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; - m_dimensions[NumDims-2] = op.patch_rows(); - m_dimensions[NumDims-3] = op.patch_cols(); - m_dimensions[NumDims-4] = m_outputRows * m_outputCols; - for (int i = NumDims-5; i >= 0; --i) { - m_dimensions[i] = input_dims[i]; - } - } - - // Strides for moving the patch in various dimensions. 
- if (static_cast(Layout) == static_cast(ColMajor)) { - m_colStride = m_dimensions[1]; - m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; - m_otherStride = m_patchStride * m_dimensions[3]; - } else { - m_colStride = m_dimensions[NumDims-2]; - m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; - m_otherStride = m_patchStride * m_dimensions[NumDims-4]; - } - - // Strides for navigating through the input tensor. - m_rowInputStride = m_inputDepth; - m_colInputStride = m_inputDepth * m_inputRows; - m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; - - // Fast representations of different variables. - m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); - m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); - m_fastColStride = internal::TensorIntDivisor(m_colStride); - m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); - m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); - m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); - - // Number of patches in the width dimension. - m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); - } else { - m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Patch index corresponding to the passed in index. 
- const Index patchIndex = index / m_fastPatchStride; - // Find the offset of the element wrt the location of the first element. - const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; - - // Other ways to index this element. - const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; - const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; - - // Calculate col index in the input original tensor. - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { - return Scalar(m_paddingValue); - } - - // Calculate row index in the original input tensor. - const Index rowIndex = patch2DIndex - colIndex * m_outputRows; - const Index rowOffset = patchOffset - colOffset * m_colStride; - const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - - const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { - return packetWithPossibleZero(index); - } - - const Index indices[2] = {index, index + PacketSize - 1}; - const Index patchIndex = indices[0] / m_fastPatchStride; - if (patchIndex != indices[1] / m_fastPatchStride) { - return packetWithPossibleZero(index); - } - const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; - eigen_assert(otherIndex == indices[1] / m_fastOtherStride); - - // Find the offset of the element wrt the location of the first element. - const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, - (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; - - const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; - eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); - - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; - - // Calculate col indices in the original input tensor. 
- const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - - m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; - if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { - return internal::pset1(Scalar(m_paddingValue)); - } - - if (inputCols[0] == inputCols[1]) { - const Index rowIndex = patch2DIndex - colIndex * m_outputRows; - const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; - eigen_assert(rowOffsets[0] <= rowOffsets[1]); - // Calculate col indices in the original input tensor. - const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - - m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; - - if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { - return internal::pset1(Scalar(m_paddingValue)); - } - - if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { - // no padding - const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; - return m_impl.template packet(inputIndex); - } - } - - return packetWithPossibleZero(index); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - const TensorEvaluator& impl() const { return m_impl; } - - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // We conservatively estimate the cost for the code path where the computed - // index is inside the original image and - // TensorEvaluator::CoordAccess is false. 
- const double compute_cost = 3 * TensorOpCost::DivCost() + - 6 * TensorOpCost::MulCost() + - 8 * TensorOpCost::MulCost(); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - Dimensions m_dimensions; - - Index m_otherStride; - Index m_patchStride; - Index m_colStride; - Index m_row_strides; - Index m_col_strides; - - Index m_in_row_strides; - Index m_in_col_strides; - Index m_row_inflate_strides; - Index m_col_inflate_strides; - - Index m_input_rows_eff; - Index m_input_cols_eff; - Index m_patch_rows_eff; - Index m_patch_cols_eff; - - internal::TensorIntDivisor m_fastOtherStride; - internal::TensorIntDivisor m_fastPatchStride; - internal::TensorIntDivisor m_fastColStride; - internal::TensorIntDivisor m_fastInflateRowStride; - internal::TensorIntDivisor m_fastInflateColStride; - internal::TensorIntDivisor m_fastInputColsEff; - - Index m_rowInputStride; - Index m_colInputStride; - Index m_patchInputStride; - - Index m_inputDepth; - Index m_inputRows; - Index m_inputCols; - - Index m_outputRows; - Index m_outputCols; - - Index m_rowPaddingTop; - Index m_colPaddingLeft; - - internal::TensorIntDivisor m_fastOutputRows; - internal::TensorIntDivisor m_fastOutputDepth; - - Scalar m_paddingValue; - - TensorEvaluator m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h deleted file mode 100644 index 3209fecd..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h 
+++ /dev/null @@ -1,725 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H - - -#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES - -#define EIGEN_HAS_INDEX_LIST - -namespace Eigen { - -/** \internal - * - * \class TensorIndexList - * \ingroup CXX11_Tensor_Module - * - * \brief Set of classes used to encode a set of Tensor dimensions/indices. - * - * The indices in the list can be known at compile time or at runtime. A mix - * of static and dynamic indices can also be provided if needed. The tensor - * code will attempt to take advantage of the indices that are known at - * compile time to optimize the code it generates. - * - * This functionality requires a c++11 compliant compiler. If your compiler - * is older you need to use arrays of indices instead. - * - * Several examples are provided in the cxx11_tensor_index_list.cpp file. - * - * \sa Tensor - */ - -template -struct type2index { - static const DenseIndex value = n; - EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } - EIGEN_DEVICE_FUNC void set(DenseIndex val) { - eigen_assert(val == n); - } -}; - -// This can be used with IndexPairList to get compile-time constant pairs, -// such as IndexPairList, type2indexpair<3,4>>(). 
-template -struct type2indexpair { - static const DenseIndex first = f; - static const DenseIndex second = s; - - constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { - return IndexPair(f, s); - } - - EIGEN_DEVICE_FUNC void set(const IndexPair& val) { - eigen_assert(val.first == f); - eigen_assert(val.second == s); - } -}; - - -template struct NumTraits > -{ - typedef DenseIndex Real; - enum { - IsComplex = 0, - RequireInitialization = false, - ReadCost = 1, - AddCost = 1, - MulCost = 1 - }; - - EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static inline Real highest() { return n; } - EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } -}; - -namespace internal { -template -EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { - val = new_val; -} -template -EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { - val.set(new_val); -} - -template -EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { - val = new_val; -} -template -EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { - val.set(new_val); -} - - -template -struct is_compile_time_constant { - static constexpr bool value = false; -}; - -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; - -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool 
value = true; -}; - - -template -struct IndexTuple; - -template -struct IndexTuple { - EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } - EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { } - - constexpr static int count = 1 + sizeof...(O); - T head; - IndexTuple others; - typedef T Head; - typedef IndexTuple Other; -}; - -template - struct IndexTuple { - EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } - EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } - - constexpr static int count = 1; - T head; - typedef T Head; -}; - - -template -struct IndexTupleExtractor; - -template -struct IndexTupleExtractor { - - typedef typename IndexTupleExtractor::ValType ValType; - - EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { - return IndexTupleExtractor::get_val(val.others); - } - - EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { - return IndexTupleExtractor::get_val(val.others); - } - template - EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { - IndexTupleExtractor::set_val(val.others, new_val); - } - -}; - -template - struct IndexTupleExtractor<0, T, O...> { - - typedef T ValType; - - EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { - return val.head; - } - EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { - return val.head; - } - template - EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { - val.head = new_val; - } -}; - - - -template -EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { - return IndexTupleExtractor::get_val(tuple); -} -template -EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { - return IndexTupleExtractor::get_val(tuple); -} -template - struct array_size > { - static const size_t value = IndexTuple::count; -}; -template - struct 
array_size > { - static const size_t value = IndexTuple::count; -}; - - - - -template -struct tuple_coeff { - template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { - // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); - return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); - } - template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { - if (i == Idx) { - update_value(array_get(t), value); - } else { - tuple_coeff::set(i, t, value); - } - } - - template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { - return ((i == Idx) & is_compile_time_constant::ValType>::value) || - tuple_coeff::value_known_statically(i, t); - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { - return is_compile_time_constant::ValType>::value && - tuple_coeff::values_up_to_known_statically(t); - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { - return is_compile_time_constant::ValType>::value && - is_compile_time_constant::ValType>::value && - array_get(t) > array_get(t) && - tuple_coeff::values_up_to_statically_known_to_increase(t); - } -}; - -template -struct tuple_coeff<0, ValueT> { - template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { - // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return array_get<0>(t)/* * (i == 0)*/; - } - template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { - eigen_assert (i == 0); - update_value(array_get<0>(t), value); - } - template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { - return is_compile_time_constant::ValType>::value & (i == 0); - } - - template - 
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { - return is_compile_time_constant::ValType>::value; - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { - return true; - } -}; -} // namespace internal - - - -template -struct IndexList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); - } - - EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } - EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } - EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } - - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); - } - EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); - } - - EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); - } -}; - - -template -constexpr IndexList make_index_list(FirstType val1, OtherTypes... 
other_vals) { - return IndexList(val1, other_vals...); -} - - -template -struct IndexPairList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { - return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); - } - - EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } - EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } - - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); - } -}; - -namespace internal { - -template size_t array_prod(const IndexList& sizes) { - size_t result = 1; - for (int i = 0; i < array_size >::value; ++i) { - result *= sizes[i]; - } - return result; -} - -template struct array_size > { - static const size_t value = array_size >::value; -}; -template struct array_size > { - static const size_t value = array_size >::value; -}; - -template struct array_size > { - static const size_t value = std::tuple_size >::value; -}; -template struct array_size > { - static const size_t value = std::tuple_size >::value; -}; - -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { - return IndexTupleExtractor::get_val(a); -} -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { - return IndexTupleExtractor::get_val(a); -} - -template -struct index_known_statically_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return false; - } -}; - -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { - return IndexList().value_known_statically(i); - } -}; - -template -struct 
index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { - return IndexList().value_known_statically(i); - } -}; - - -template -struct all_indices_known_statically_impl { - static constexpr bool run() { - return false; - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return IndexList().all_values_known_statically(); - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return IndexList().all_values_known_statically(); - } -}; - - -template -struct indices_statically_known_to_increase_impl { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return false; - } -}; - -template - struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return Eigen::IndexList().values_statically_known_to_increase(); - } -}; - -template - struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return Eigen::IndexList().values_statically_known_to_increase(); - } -}; - - -template -struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) == value); - } -}; - -template -struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) == value); - } -}; - - -template -struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, 
const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) != value); - } -}; - -template -struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) != value); - } -}; - - -template -struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) > value); - } -}; - -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) > value); - } -}; - - - -template -struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) < value); - } -}; - -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) < value); - } -}; - - - -template -struct index_pair_first_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).first == value); - } -}; - -template -struct 
index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).first == value); - } -}; - - - -template -struct index_pair_second_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).second == value); - } -}; - -template -struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).second == value); - } -}; - - -} // end namespace internal -} // end namespace Eigen - -#else - -namespace Eigen { -namespace internal { - -template -struct index_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return false; - } -}; - -template -struct all_indices_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return false; - } -}; - -template -struct indices_statically_known_to_increase_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return false; - } -}; - -template -struct index_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_ne_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_gt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_lt_impl { - static 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_first_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_second_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - - - -} // end namespace internal -} // end namespace Eigen - -#endif - - -namespace Eigen { -namespace internal { -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { - return index_known_statically_impl::run(i); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { - return all_indices_known_statically_impl::run(); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { - return indices_statically_known_to_increase_impl::run(); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { - return index_statically_eq_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { - return index_statically_ne_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { - return index_statically_gt_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { - return index_statically_lt_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { - return index_pair_first_statically_eq_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { - return 
index_pair_second_statically_eq_impl::run(i, value); -} - -} // end namespace internal -} // end namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h deleted file mode 100644 index f391fb9e..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ /dev/null @@ -1,229 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Ke Yang -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H -#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H - -namespace Eigen { - -/** \class TensorInflation - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor inflation class. 
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorInflationOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorInflationOp type; -}; - -} // end namespace internal - -template -class TensorInflationOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) - : m_xpr(expr), m_strides(strides) {} - - EIGEN_DEVICE_FUNC - const Strides& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Strides m_strides; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorInflationOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - 
enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) - { - m_dimensions = m_impl.dimensions(); - // Expand each dimension to the inflated dimension. - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; - } - - // Remember the strides for fast division. - for (int i = 0; i < NumDims; ++i) { - m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } - } else { // RowMajor - m_outputStrides[NumDims-1] = 1; - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - // Computes the input index given the output index. Returns true if the output - // index doesn't fall into a hole. 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const - { - eigen_assert(index < dimensions().TotalSize()); - *inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (idx != idx / m_fastStrides[i] * m_strides[i]) { - return false; - } - *inputIndex += idx / m_strides[i] * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index != index / m_fastStrides[0] * m_strides[0]) { - return false; - } - *inputIndex += index / m_strides[0]; - return true; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (idx != idx / m_fastStrides[i] * m_strides[i]) { - return false; - } - *inputIndex += idx / m_strides[i] * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { - return false; - } - *inputIndex += index / m_strides[NumDims - 1]; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index inputIndex = 0; - if (getInputIndex(index, &inputIndex)) { - return m_impl.coeff(inputIndex); - } else { - return Scalar(0); - } - } - - // TODO(yangke): optimize this function so that we can detect and produce - // all-zero packets - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (3 * 
TensorOpCost::DivCost() + - 3 * TensorOpCost::MulCost() + - 2 * TensorOpCost::AddCost()); - const double input_size = m_impl.dimensions().TotalSize(); - const double output_size = m_dimensions.TotalSize(); - if (output_size == 0) - return TensorOpCost(); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, - compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - const Strides m_strides; - array, NumDims> m_fastStrides; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h deleted file mode 100644 index 33edc49e..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ /dev/null @@ -1,82 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H -#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H - -#if EIGEN_HAS_VARIADIC_TEMPLATES - -#include - -namespace Eigen { - -/** \class TensorInitializer - * \ingroup CXX11_Tensor_Module - * - * \brief Helper template to initialize Tensors from std::initializer_lists. 
- */ -namespace internal { - -template -struct Initializer { - typedef std::initializer_list< - typename Initializer::InitList> InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>* indices, - const InitList& vals) { - int i = 0; - for (auto v : vals) { - (*indices)[traits::NumDimensions - N] = i++; - Initializer::run(tensor, indices, v); - } - } -}; - -template -struct Initializer { - typedef std::initializer_list::Scalar> InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>* indices, - const InitList& vals) { - int i = 0; - // There is likely a faster way to do that than iterating. - for (auto v : vals) { - (*indices)[traits::NumDimensions - 1] = i++; - tensor.coeffRef(*indices) = v; - } - } -}; - -template -struct Initializer { - typedef typename traits::Scalar InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>*, - const InitList& v) { - tensor.coeffRef(0) = v; - } -}; - - -template -void initialize_tensor(TensorEvaluator& tensor, - const typename Initializer::NumDimensions>::InitList& vals) { - Eigen::array::Index, traits::NumDimensions> indices; - Initializer::NumDimensions>::run(tensor, &indices, vals); -} - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_HAS_VARIADIC_TEMPLATES - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h deleted file mode 100644 index ede3939c..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ /dev/null @@ -1,253 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H -#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H - - -namespace Eigen { - -/** \internal - * - * \class TensorIntDiv - * \ingroup CXX11_Tensor_Module - * - * \brief Fast integer division by a constant. - * - * See the paper from Granlund and Montgomery for explanation. - * (at http://dx.doi.org/10.1145/773473.178249) - * - * \sa Tensor - */ - -namespace internal { - -namespace { - - // Note: result is undefined if val == 0 - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if::type count_leading_zeros(const T val) - { -#ifdef __CUDA_ARCH__ - return __clz(val); -#elif EIGEN_COMP_MSVC - unsigned long index; - _BitScanReverse(&index, val); - return 31 - index; -#else - EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return __builtin_clz(static_cast(val)); -#endif - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if::type count_leading_zeros(const T val) - { -#ifdef __CUDA_ARCH__ - return __clzll(val); -#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 - unsigned long index; - _BitScanReverse64(&index, val); - return 63 - index; -#elif EIGEN_COMP_MSVC - // MSVC's _BitScanReverse64 is not available for 32bits builds. 
- unsigned int lo = (unsigned int)(val&0xffffffff); - unsigned int hi = (unsigned int)((val>>32)&0xffffffff); - int n; - if(hi==0) - n = 32 + count_leading_zeros(lo); - else - n = count_leading_zeros(hi); - return n; -#else - EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return __builtin_clzll(static_cast(val)); -#endif - } - - template - struct UnsignedTraits { - typedef typename conditional::type type; - }; - - template - struct DividerTraits { - typedef typename UnsignedTraits::type type; - static const int N = sizeof(T) * 8; - }; - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) - return __umulhi(a, b); -#else - return (static_cast(a) * b) >> 32; -#endif - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) - return __umul64hi(a, b); -#elif defined(__SIZEOF_INT128__) - __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); - return static_cast(v >> 64); -#else - return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); -#endif - } - - template - struct DividerHelper { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { - EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); - return static_cast((static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1); - } - }; - - template - struct DividerHelper<64, T> { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) - return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); -#else - const uint64_t shift = 1ULL << log_div; - TensorUInt128 result = TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - 
- TensorUInt128, static_val<0> >(1, 0) - + TensorUInt128, static_val<1> >(1); - return static_cast(result); -#endif - } - }; -} - - -template -struct TensorIntDivisor { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { - multiplier = 0; - shift1 = 0; - shift2 = 0; - } - - // Must have 0 < divider < 2^31. This is relaxed to - // 0 < divider < 2^63 when using 64-bit indices on platforms that support - // the __uint128_t type. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { - const int N = DividerTraits::N; - eigen_assert(static_cast::type>(divider) < NumTraits::highest()/2); - eigen_assert(divider > 0); - - // fast ln2 - const int leading_zeros = count_leading_zeros(static_cast(divider)); - int log_div = N - leading_zeros; - // if divider is a power of two then log_div is 1 more than it should be. - if ((static_cast::type>(1) << (log_div-1)) == static_cast::type>(divider)) - log_div--; - - multiplier = DividerHelper::computeMultiplier(log_div, divider); - shift1 = log_div > 1 ? 1 : log_div; - shift2 = log_div > 1 ? log_div-1 : 0; - } - - // Must have 0 <= numerator. On platforms that dont support the __uint128_t - // type numerator should also be less than 2^32-1. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { - eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); - //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above - - UnsignedType t1 = muluh(multiplier, numerator); - UnsignedType t = (static_cast(numerator) - t1) >> shift1; - return (t1 + t) >> shift2; - } - - private: - typedef typename DividerTraits::type UnsignedType; - UnsignedType multiplier; - int32_t shift1; - int32_t shift2; -}; - - -// Optimized version for signed 32 bit integers. -// Derived from Hacker's Delight. 
-// Only works for divisors strictly greater than one -template <> -class TensorIntDivisor { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { - magic = 0; - shift = 0; - } - // Must have 2 <= divider - EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { - eigen_assert(divider >= 2); - calcMagic(divider); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ - return (__umulhi(magic, n) >> shift); -#else - uint64_t v = static_cast(magic) * static_cast(n); - return (static_cast(v >> 32) >> shift); -#endif - } - -private: - // Compute the magic numbers. See Hacker's Delight section 10 for an in - // depth explanation. - EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { - const unsigned two31 = 0x80000000; // 2**31. - unsigned ad = d; - unsigned t = two31 + (ad >> 31); - unsigned anc = t - 1 - t%ad; // Absolute value of nc. - int p = 31; // Init. p. - unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. - unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). - unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. - unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). - unsigned delta = 0; - do { - p = p + 1; - q1 = 2*q1; // Update q1 = 2**p/|nc|. - r1 = 2*r1; // Update r1 = rem(2**p, |nc|). - if (r1 >= anc) { // (Must be an unsigned - q1 = q1 + 1; // comparison here). - r1 = r1 - anc;} - q2 = 2*q2; // Update q2 = 2**p/|d|. - r2 = 2*r2; // Update r2 = rem(2**p, |d|). - if (r2 >= ad) { // (Must be an unsigned - q2 = q2 + 1; // comparison here). 
- r2 = r2 - ad;} - delta = ad - r2; - } while (q1 < delta || (q1 == delta && r1 == 0)); - - magic = (unsigned)(q2 + 1); - shift = p - 32; - } - - uint32_t magic; - int32_t shift; -}; - - -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { - return divisor.divide(numerator); -} - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h deleted file mode 100644 index cd0109ef..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ /dev/null @@ -1,209 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H -#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H - -namespace Eigen { - -/** \class TensorLayoutSwap - * \ingroup CXX11_Tensor_Module - * - * \brief Swap the layout from col-major to row-major, or row-major - * to col-major, and invert the order of the dimensions. - * - * Beware: the dimensions are reversed by this operation. If you want to - * preserve the ordering of the dimensions, you need to combine this - * operation with a shuffle. 
- * - * \example: - * Tensor input(2, 4); - * Tensor output = input.swap_layout(); - * eigen_assert(output.dimension(0) == 4); - * eigen_assert(output.dimension(1) == 2); - * - * array shuffle(1, 0); - * output = input.swap_layout().shuffle(shuffle); - * eigen_assert(output.dimension(0) == 2); - * eigen_assert(output.dimension(1) == 4); - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorLayoutSwapOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorLayoutSwapOp type; -}; - -} // end namespace internal - - - -template -class TensorLayoutSwapOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - 
EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorLayoutSwapOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - for(int i = 0; i < NumDims; ++i) { - m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; - } - } - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return 
m_impl.data(); } - - const TensorEvaluator& impl() const { return m_impl; } - - protected: - TensorEvaluator m_impl; - Dimensions m_dimensions; -}; - - -// Eval as lvalue -template - struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorLayoutSwapOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, - CoordAccess = false // to be implemented - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(index); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - this->m_impl.template writePacket(index, x); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h deleted file mode 100644 index ee0078bb..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ /dev/null @@ -1,54 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H - - -/** use this macro in sfinae selection in templated functions - * - * template::value , int >::type = 0 - * > - * void foo(){} - * - * becomes => - * - * template::value ) - * > - * void foo(){} - */ - -// SFINAE requires variadic templates -#ifndef __CUDACC__ -#if EIGEN_HAS_VARIADIC_TEMPLATES - // SFINAE doesn't work for gcc <= 4.7 - #ifdef EIGEN_COMP_GNUC - #if EIGEN_GNUC_AT_LEAST(4,8) - #define EIGEN_HAS_SFINAE - #endif - #else - #define EIGEN_HAS_SFINAE - #endif -#endif -#endif - -#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ - typename internal::enable_if< ( __condition__ ) , int >::type = 0 - - -#if EIGEN_HAS_CONSTEXPR -#define EIGEN_CONSTEXPR constexpr -#else -#define EIGEN_CONSTEXPR -#endif - - -#endif diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h deleted file mode 100644 index a8e55757..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ /dev/null @@ -1,321 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H -#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H - -namespace Eigen { - -/** \class TensorMap - * \ingroup CXX11_Tensor_Module - * - * \brief A tensor expression mapping an existing array of data. - * - */ -/// template class MakePointer_ is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler T* is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T. 
-/// This is done through our MakePointer_ class. By default the Type in the MakePointer_ is T* . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is T*. -template class MakePointer_> class TensorMap : public TensorBase > -{ - public: - typedef TensorMap Self; - typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::traits::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - /* typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType;*/ - typedef typename MakePointer_::Type PointerType; - typedef PointerType PointerArgType; - - static const int Options = Options_; - - static const Index NumIndices = PlainObjectType::NumIndices; - typedef typename PlainObjectType::Dimensions Dimensions; - - enum { - IsAligned = ((int(Options_)&Aligned)==Aligned), - Layout = PlainObjectType::Layout, - CoordAccess = true, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { - EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { - EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { - EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { - EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) - : m_data(dataPtr), m_dimensions(dimensions) - { } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) - : m_data(dataPtr), m_dimensions(dimensions) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) - : 
m_data(tensor.data()), m_dimensions(tensor.dimensions()) - { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerType data() { return m_data; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const - { - // eigen_assert(checkIndexRange(indices)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(indices); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(indices); - return m_data[index]; - } - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) - return m_data[0]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const - { - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[1]; - return m_data[index]; - } else { - const Index index = i0 + i1 * m_dimensions[0]; - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); - 
return m_data[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) - { - // eigen_assert(checkIndexRange(indices)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(indices); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(indices); - return m_data[index]; - } - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) - return m_data[0]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) - { - static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - const std::size_t NumDims = sizeof...(otherIndices) + 2; - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[1]; - return m_data[index]; - } else { - const Index index = i0 + i1 * m_dimensions[0]; - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); - return m_data[index]; 
- } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); - return m_data[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Self& operator=(const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - private: - typename MakePointer_::Type m_data; - Dimensions m_dimensions; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h deleted file mode 100644 index 615559d4..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ /dev/null @@ -1,218 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// 
for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_H - -namespace Eigen { - -template struct Cond {}; - -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T1& choose(Cond, const T1& first, const T2&) { - return first; -} - -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T2& choose(Cond, const T1&, const T2& second) { - return second; -} - - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T divup(const X x, const Y y) { - return static_cast((x + y - 1) / y); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T divup(const T x, const T y) { - return static_cast((x + y - 1) / y); -} - -template struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; - - -// Default packet types -template -struct PacketType : internal::packet_traits { - typedef typename internal::packet_traits::type type; -}; - -// For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) -template <> -struct PacketType { - typedef half2 type; - static const int size = 2; - enum { - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 0, - HasMin = 1, - HasMax = 1, - HasConj = 0, - HasSetLinear = 0, - HasBlend = 0, - - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasLog = 1, - HasLog1p = 0, - HasLog10 = 0, - HasPow = 1, - }; -}; -#endif - -#if defined(EIGEN_USE_SYCL) -template - struct PacketType { - typedef T type; - static const int size = 1; - enum { - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasArg = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - 
HasSetLinear = 0, - HasBlend = 0 - }; -}; -#endif - - -// Tuple mimics std::pair but works on e.g. nvcc. -template struct Tuple { - public: - U first; - V second; - - typedef U first_type; - typedef V second_type; - - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple() : first(), second() {} - - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple(const U& f, const V& s) : first(f), second(s) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple& operator= (const Tuple& rhs) { - if (&rhs == this) return *this; - first = rhs.first; - second = rhs.second; - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void swap(Tuple& rhs) { - using numext::swap; - swap(first, rhs.first); - swap(second, rhs.second); - } -}; - -template -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -bool operator==(const Tuple& x, const Tuple& y) { - return (x.first == y.first && x.second == y.second); -} - -template -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -bool operator!=(const Tuple& x, const Tuple& y) { - return !(x == y); -} - - -// Can't use std::pairs on cuda devices -template struct IndexPair { - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} - - EIGEN_DEVICE_FUNC void set(IndexPair val) { - first = val.first; - second = val.second; - } - - Idx first; - Idx second; -}; - - -#ifdef EIGEN_HAS_SFINAE -namespace internal { - - template - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx, numeric_list) { - return { idx[Is]... 
}; - } - template - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType&, numeric_list) { - return array(); - } - - /** Make an array (for index/dimensions) out of a custom index */ - template - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx) { - return customIndices2Array(idx, typename gen_numeric_list::type{}); - } - - - template - struct is_base_of - { - - typedef char (&yes)[1]; - typedef char (&no)[2]; - - template - struct Host - { - operator BB*() const; - operator DD*(); - }; - - template - static yes check(D*, T); - static no check(B*, int); - - static const bool value = sizeof(check(Host(), int())) == sizeof(yes); - }; - -} -#endif - - - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h deleted file mode 100644 index d34f1e32..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ /dev/null @@ -1,888 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H -#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H - -namespace Eigen { - -/** \class TensorReshaping - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. 
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorReshapingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorReshapingOp type; -}; - -} // end namespace internal - - - -template -class TensorReshapingOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) - : m_xpr(expr), m_dims(dims) {} - - EIGEN_DEVICE_FUNC - const NewDimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const NewDimensions m_dims; -}; - - -// Eval 
as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorReshapingOp XprType; - typedef NewDimensions Dimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.dimensions()) - { - // The total size of the reshaped tensor must be equal to the total size - // of the input tensor. - eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } - - EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } - - protected: - TensorEvaluator m_impl; - NewDimensions m_dimensions; -}; - - -// Eval as lvalue -template - struct TensorEvaluator, Device> - : public TensorEvaluator, Device> - 
-{ - typedef TensorEvaluator, Device> Base; - typedef TensorReshapingOp XprType; - typedef NewDimensions Dimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(index); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - this->m_impl.template writePacket(index, x); - } -}; - - -/** \class TensorSlicing - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor slicing class. 
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorSlicingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorSlicingOp type; -}; - -} // end namespace internal - - - -template -class TensorSlicingOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) - : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} - - EIGEN_DEVICE_FUNC - const StartIndices& startIndices() const { return m_indices; } - EIGEN_DEVICE_FUNC - const Sizes& sizes() const { return m_sizes; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - - 
protected: - typename XprType::Nested m_xpr; - const StartIndices m_indices; - const Sizes m_sizes; -}; - - -// Fixme: figure out the exact threshold -namespace { -template struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } - - private: - Index threshold_; -}; - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. -#ifdef EIGEN_USE_GPU -template struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } -}; -#endif -} - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) - { - for (std::size_t i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const Sizes& output_dims = op.sizes(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } - - // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. 
- m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } - } else { - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - } - - // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } - } - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - m_impl.evalSubExprsIfNeeded(NULL); - if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { - Index contiguous_values = 1; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; - } - } - } else { - for (int i = NumDims-1; i >= 0; --i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; - } - } - } - // Use memcpy if it's going to be faster than using the regular evaluation. 
- const MemcpyTriggerForSlicing trigger(m_device); - if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); - for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { - Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); - } - return false; - } - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); - - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); - inputIndices[1] += (indices[1] + 
m_offsets[NumDims-1]); - } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - PacketReturnType rslt = m_impl.template packet(inputIndices[0]); - return rslt; - } - else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - Scalar* result = m_impl.data(); - if (result) { - Index offset = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i+1; j < NumDims; ++j) { - if (m_dimensions[j] > 1) { - return NULL; - } - offset += m_offsets[j] * m_inputStrides[j]; - } - break; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i-1; j >= 0; --j) { - if (m_dimensions[j] > 1) { - return NULL; - } - offset += m_offsets[j] * m_inputStrides[j]; - } - break; - } - } - } - return result + offset; - } - return NULL; - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / 
m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[NumDims-1]); - } - return inputIndex; - } - - array m_outputStrides; - array, NumDims> m_fastOutputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - const Device& m_device; - Dimensions m_dimensions; - const StartIndices m_offsets; -}; - - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - const int packetSize = internal::unpacket_traits::size; - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * 
this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + this->m_offsets[0]); - inputIndices[1] += (indices[1] + this->m_offsets[0]); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); - inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); - } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - this->m_impl.template writePacket(inputIndices[0], x); - } - else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - internal::pstore(values, x); - this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - for (int i = 1; i < packetSize-1; ++i) { - this->coeffRef(index+i) = values[i]; - } - } - } -}; - - - -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorStridingSlicingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorStridingSlicingOp type; -}; - -} // end namespace internal - - -template -class TensorStridingSlicingOp : public TensorBase > -{ - public: - typedef typename internal::traits::Scalar Scalar; - 
typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( - const XprType& expr, const StartIndices& startIndices, - const StopIndices& stopIndices, const Strides& strides) - : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), - m_strides(strides) {} - - EIGEN_DEVICE_FUNC - const StartIndices& startIndices() const { return m_startIndices; } - EIGEN_DEVICE_FUNC - const StartIndices& stopIndices() const { return m_stopIndices; } - EIGEN_DEVICE_FUNC - const StartIndices& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const StartIndices m_startIndices; - const StopIndices m_stopIndices; - const Strides m_strides; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorStridingSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets and sizes. 
- IsAligned = false, - PacketAccess = false, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) - { - // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero - DSizes startIndicesClamped, stopIndicesClamped; - for (size_t i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); - if(m_strides[i]>0){ - startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - }else{ - /* implies m_strides[i]<0 by assert */ - startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); - } - m_startIndices[i] = startIndicesClamped[i]; - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - - // check for degenerate intervals and compute output tensor shape - bool degenerate = false;; - for(int i = 0; i < NumDims; i++){ - Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; - if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ - m_dimensions[i] = 0; - degenerate = true; - }else{ - m_dimensions[i] = interval / m_strides[i] - + (interval % m_strides[i] != 0 ? 
1 : 0); - eigen_assert(m_dimensions[i] >= 0); - } - } - Strides output_dims = m_dimensions; - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = m_strides[0]; - m_offsets[0] = startIndicesClamped[0]; - Index previousDimProduct = 1; - for (int i = 1; i < NumDims; ++i) { - previousDimProduct *= input_dims[i-1]; - m_inputStrides[i] = previousDimProduct * m_strides[i]; - m_offsets[i] = startIndicesClamped[i] * previousDimProduct; - } - - // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); - } - } else { - m_inputStrides[NumDims-1] = m_strides[NumDims-1]; - m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; - Index previousDimProduct = 1; - for (int i = NumDims - 2; i >= 0; --i) { - previousDimProduct *= input_dims[i+1]; - m_inputStrides[i] = previousDimProduct * m_strides[i]; - m_offsets[i] = startIndicesClamped[i] * previousDimProduct; - } - - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 
1 : m_outputStrides[i]); - } - } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - return NULL; - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i >= 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i] + m_offsets[i]; - index -= idx * m_outputStrides[i]; - } - } else { - for (int i = 0; i < NumDims; ++i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i] + m_offsets[i]; - index -= idx * m_outputStrides[i]; - } - } - return inputIndex; - } - - static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { - return numext::maxi(min, numext::mini(max,value)); - } - - array m_outputStrides; - array, NumDims> m_fastOutputStrides; - array m_inputStrides; - TensorEvaluator m_impl; 
- const Device& m_device; - DSizes m_startIndices; // clamped startIndices - DSizes m_dimensions; - DSizes m_offsets; // offset in a flattened shape - const Strides m_strides; - std::size_t m_block_total_size_max; -}; - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorStridingSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = TensorEvaluator::CoordAccess, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h deleted file mode 100644 index 647bcf10..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ /dev/null @@ -1,397 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H -#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H - -namespace Eigen { - -/** \class TensorPadding - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor padding class. - * At the moment only padding with a constant value is supported. - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorPaddingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorPaddingOp type; -}; - -} // end namespace internal - - - -template -class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) - : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - const PaddingDimensions& padding() const { return m_padding_dims; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const PaddingDimensions m_padding_dims; - const Scalar 
m_padding_value; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorPaddingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) - { - // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead - // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector - // of 1 element first and then pad. 
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Compute dimensions - m_dimensions = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] += m_padding[i].first + m_padding[i].second; - } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; - } else { - m_inputStrides[NumDims - 1] = 1; - m_outputStrides[NumDims] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; - } - m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(index < dimensions().TotalSize()); - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (isPaddingAtIndexForDim(index, 0)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[0].first); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / 
m_outputStrides[i+1]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - if (isPaddingAtIndexForDim(index, NumDims-1)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[NumDims-1].first); - } - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); - } - return packetRowMajor(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - TensorOpCost cost = m_impl.costPerCoeff(vectorized); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) - updateCostPerDimension(cost, i, i == 0); - } else { - for (int i = NumDims - 1; i >= 0; --i) - updateCostPerDimension(cost, i, i == NumDims - 1); - } - return cost; - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( - Index index, int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return (!internal::index_pair_first_statically_eq(dim_index, 0) && - index < m_padding[dim_index].first) || - (!internal::index_pair_second_statically_eq(dim_index, 0) && - index >= m_dimensions[dim_index] - m_padding[dim_index].second); -#else - return (index < m_padding[dim_index].first) || - (index >= m_dimensions[dim_index] - m_padding[dim_index].second); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_first_statically_eq(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( - int dim_index) const { -#if 
defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_second_statically_eq(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - - void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { - const double in = static_cast(m_impl.dimensions()[i]); - const double out = in + m_padding[i].first + m_padding[i].second; - if (out == 0) - return; - const double reduction = in / out; - cost *= reduction; - if (first) { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + - reduction * (1 * TensorOpCost::AddCost())); - } else { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - reduction * (2 * TensorOpCost::MulCost() + - 1 * TensorOpCost::DivCost())); - } - } - - protected: - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index first = index; - const Index last = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; - const Index lastPaddedRight = m_outputStrides[i+1]; - - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- const Index idx = index / m_outputStrides[i]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index last = index + PacketSize - 1; - const Index first = index; - const Index lastPaddedLeft = m_padding[0].first; - const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); - const Index lastPaddedRight = m_outputStrides[1]; - - if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. - inputIndex += (index - m_padding[0].first); - return m_impl.template packet(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - - for (int i = 0; i < NumDims - 1; ++i) { - const Index first = index; - const Index last = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; - const Index lastPaddedRight = m_outputStrides[i]; - - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. 
- return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. - const Index idx = index / m_outputStrides[i+1]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index last = index + PacketSize - 1; - const Index first = index; - const Index lastPaddedLeft = m_padding[NumDims-1].first; - const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); - const Index lastPaddedRight = m_outputStrides[NumDims-1]; - - if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- inputIndex += (index - m_padding[NumDims-1].first); - return m_impl.template packet(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - PaddingDimensions m_padding; - - Scalar m_paddingValue; -}; - - - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h deleted file mode 100644 index 886a254f..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ /dev/null @@ -1,269 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H - -namespace Eigen { - -/** \class TensorPatch - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor patch class. 
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorPatchOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorPatchOp type; -}; - -} // end namespace internal - - - -template -class TensorPatchOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) - : m_xpr(expr), m_patch_dims(patch_dims) {} - - EIGEN_DEVICE_FUNC - const PatchDim& patch_dims() const { return m_patch_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const PatchDim m_patch_dims; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorPatchOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value + 1; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = 
internal::unpacket_traits::size; - - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - Index num_patches = 1; - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const PatchDim& patch_dims = op.patch_dims(); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims-1; ++i) { - m_dimensions[i] = patch_dims[i]; - num_patches *= (input_dims[i] - patch_dims[i] + 1); - } - m_dimensions[NumDims-1] = num_patches; - - m_inputStrides[0] = 1; - m_patchStrides[0] = 1; - for (int i = 1; i < NumDims-1; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); - } - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - } else { - for (int i = 0; i < NumDims-1; ++i) { - m_dimensions[i+1] = patch_dims[i]; - num_patches *= (input_dims[i] - patch_dims[i] + 1); - } - m_dimensions[0] = num_patches; - - m_inputStrides[NumDims-2] = 1; - m_patchStrides[NumDims-2] = 1; - for (int i = NumDims-3; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); - } - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims-2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - 
m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; - // Find the location of the first element of the patch. - Index patchIndex = index / m_outputStrides[output_stride_index]; - // Find the offset of the element wrt the location of the first element. - Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = patchOffset / m_outputStrides[i]; - patchOffset -= offsetIdx * m_outputStrides[i]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = patchOffset / m_outputStrides[i+1]; - patchOffset -= offsetIdx * m_outputStrides[i+1]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } - inputIndex += (patchIndex + patchOffset); - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? 
NumDims - 1 : 0; - Index indices[2] = {index, index + PacketSize - 1}; - Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], - indices[1] / m_outputStrides[output_stride_index]}; - Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], - indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; - - Index inputIndices[2] = {0, 0}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], - patchIndices[1] / m_patchStrides[i]}; - patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; - patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; - - const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], - patchOffsets[1] / m_outputStrides[i]}; - patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; - patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; - - inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; - inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], - patchIndices[1] / m_patchStrides[i]}; - patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; - patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; - - const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], - patchOffsets[1] / m_outputStrides[i+1]}; - patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; - patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; - - inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; - inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; - } - } - inputIndices[0] += (patchIndices[0] + patchOffsets[0]); - inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - - if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { - PacketReturnType rslt = m_impl.template packet(inputIndices[0]); - return rslt; 
- } - else { - EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[PacketSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < PacketSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (TensorOpCost::DivCost() + - TensorOpCost::MulCost() + - 2 * TensorOpCost::AddCost()); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - array m_patchStrides; - - TensorEvaluator m_impl; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h deleted file mode 100644 index 1655a813..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ /dev/null @@ -1,276 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H -#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H - -namespace Eigen { -namespace internal { - -namespace { - -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ - // We don't support 3d kernels since we currently only use 1 and - // 2d kernels. 
- assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. - // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - -#else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; -#endif -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { - // TODO: Unify with the implementation in the non blocking thread pool. - uint64_t current = *state; - // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; - // Generate the random output (using the PCG-XSH-RS scheme) - return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { - seed = seed ? 
seed : get_random_seed(); - return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; -} - -} // namespace - - -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); - return static_cast(rnd); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform(uint64_t* state) { - Eigen::half result; - // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); - result.x = static_cast(rnd & 0x3ffu); - // Set the exponent - result.x |= (static_cast(15) << 10); - // Return the final result - return result - Eigen::half(1.0f); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform(uint64_t* state) { - typedef union { - uint32_t raw; - float fp; - } internal; - internal result; - // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); - result.raw = rnd & 0x7fffffu; - // Set the exponent - result.raw |= (static_cast(127) << 23); - // Return the final result - return result.fp - 1.0f; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform(uint64_t* state) { - typedef union { - uint64_t raw; - double dp; - } internal; - internal result; - result.raw = 0; - // Generate 52 random bits for the mantissa - // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; - // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); - result.raw = (static_cast(rnd1) << 32) | rnd2; - // Set the exponent - result.raw |= (static_cast(1023) << 52); - // Return the final result - return result.dp - 1.0; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* 
state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); -} - -template class UniformRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - const UniformRandomGenerator& other) { - m_state = other.m_state; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform(&local_state); - m_state = local_state; - return result; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&local_state); - } - m_state = local_state; - return internal::pload(values); - } - - private: - mutable uint64_t m_state; -}; - -template -struct functor_traits > { - enum { - // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). - Cost = 12 * NumTraits::AddCost * - ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), - PacketAccess = UniformRandomGenerator::PacketAccess - }; -}; - - - -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { - // Use the ratio of uniform method to generate numbers following a normal - // distribution. See for example Numerical Recipes chapter 7.3.9 for the - // details. 
- T u, v, q; - do { - u = RandomToTypeUniform(state); - v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); - const T x = u - T(0.449871); - const T y = numext::abs(v) + T(0.386595); - q = x*x + y * (T(0.196)*y - T(0.25472)*x); - } while (q > T(0.27597) && - (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); - - return v/u; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); -} - - -template class NormalRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( - const NormalRandomGenerator& other) { - m_state = other.m_state; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal(&local_state); - m_state = local_state; - return result; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&local_state); - } - m_state = local_state; - return internal::pload(values); - } - - private: - mutable uint64_t m_state; -}; - - -template -struct functor_traits > { - enum { - // On average, we need to generate about 3 random numbers - // 15 mul, 8 add, 1.5 logs - Cost = 3 * functor_traits >::Cost + - 15 * NumTraits::AddCost + 8 * 
NumTraits::AddCost + - 3 * functor_traits >::Cost / 2, - PacketAccess = NormalRandomGenerator::PacketAccess - }; -}; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h deleted file mode 100644 index 41d0d002..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ /dev/null @@ -1,781 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H - -namespace Eigen { - -/** \class TensorReduction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reduction class. - * - */ - -namespace internal { - template class MakePointer_ > - struct traits > - : traits -{ - typedef traits XprTraits; - typedef typename XprTraits::Scalar Scalar; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size::value; - static const int Layout = XprTraits::Layout; - - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorReductionOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorReductionOp type; -}; - - -template struct DimInitializer { - template EIGEN_DEVICE_FUNC - static void run(const InputDims& input_dims, - const array::value>& reduced, - OutputDims* output_dims, ReducedDims* reduced_dims) { - const int NumInputDims = internal::array_size::value; - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { - (*reduced_dims)[reduceIndex] = input_dims[i]; - ++reduceIndex; - } else { - (*output_dims)[outputIndex] = input_dims[i]; - ++outputIndex; - } - } - } -}; - -template <> struct DimInitializer > { - template EIGEN_DEVICE_FUNC - static void run(const InputDims& input_dims, const array&, - Sizes<>*, array* reduced_dims) { - const int NumInputDims = internal::array_size::value; - for (int i = 0; i < NumInputDims; ++i) { - (*reduced_dims)[i] = input_dims[i]; - } - } -}; - - -template -struct are_inner_most_dims { - static const bool value = false; -}; -template -struct preserve_inner_most_dims { - static const bool value = false; -}; - -#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES -template -struct are_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_eq(0, 0); - static const bool tmp3 = index_statically_eq(array_size::value-1, array_size::value-1); - static const bool value = tmp1 & tmp2 & tmp3; -}; -template -struct are_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); - static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); - static const bool value = tmp1 & 
tmp2 & tmp3; - -}; -template -struct preserve_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_gt(0, 0); - static const bool value = tmp1 & tmp2; - -}; -template -struct preserve_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); - static const bool value = tmp1 & tmp2; -}; -#endif - - -template -struct GenericDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; - GenericDimReducer::reduce(self, input, reducer, accum); - } - } -}; -template -struct GenericDimReducer<0, Self, Op> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - for (int j = 0; j < self.m_reducedDims[0]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; - reducer.reduce(self.m_impl.coeff(input), accum); - } - } -}; -template -struct GenericDimReducer<-1, Self, Op> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) { - reducer.reduce(self.m_impl.coeff(index), accum); - } -}; - -template -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - typename Self::CoeffReturnType accum = reducer.initialize(); - for (typename Self::Index j = 0; j < 
numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalize(accum); - } -}; - -template -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const int packetSize = internal::unpacket_traits::size; - const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType p = reducer.template initializePacket(); - for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); - } - typename Self::CoeffReturnType accum = reducer.initialize(); - for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalizeBoth(accum, p); - } -}; - -template -struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { - eigen_assert(false && "should never be called"); - } -}; - -template -struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; - InnerMostDimPreserver::reduce(self, input, reducer, accum); - } - } -}; - -template -struct InnerMostDimPreserver<0, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - for (typename 
Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; - reducer.reducePacket(self.m_impl.template packet(input), accum); - } - } -}; -template -struct InnerMostDimPreserver<-1, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { - eigen_assert(false && "should never be called"); - } -}; - -// Default full reducer -template -struct FullReducer { - static const bool HasOptimizedImplementation = false; - - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { - const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - } -}; - - -#ifdef EIGEN_USE_THREADS -// Multithreaded full reducers -template -struct FullReducerShard { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer, - typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer::reduce( - self, firstIndex, numValuesToReduce, reducer); - } -}; - -// Multithreaded full reducer -template -struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits::size; - - // launch one reducer per thread and accumulate the result. 
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); - return; - } - const TensorOpCost cost = - self.m_impl.costPerCoeff(Vectorizable) + - TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, - PacketSize); - const int num_threads = TensorCostModel::numThreads( - num_coeffs, cost, device.numThreads()); - if (num_threads == 1) { - *output = - InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } - const Index blocksize = - std::floor(static_cast(num_coeffs) / num_threads); - const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; - eigen_assert(num_coeffs >= numblocks * blocksize); - - Barrier barrier(internal::convert_index(numblocks)); - MaxSizeVector shards(numblocks, reducer.initialize()); - for (Index i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, - self, i * blocksize, blocksize, reducer, - &shards[i]); - } - typename Self::CoeffReturnType finalShard; - if (numblocks * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, - reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - - for (Index i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } -}; - -#endif - - -// Default inner reducer -template -struct InnerReducer { - static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; - -// Default outer reducer -template -struct OuterReducer { - 
static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); - - -#ifdef EIGEN_HAS_CUDA_FP16 -template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); -template -__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); -template -__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); - -#endif - -template -__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - -template -__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -#endif - -} // end namespace internal - - -template class MakePointer_> -class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) - { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const XprType& expression() const { return m_expr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dims& dims() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Op& reducer() 
const { return m_reducer; } - - protected: - typename XprType::Nested m_expr; - const Dims m_dims; - const Op m_reducer; -}; - - -// Eval as rvalue -template class MakePointer_, typename Device> -struct TensorEvaluator, Device> -{ - typedef TensorReductionOp XprType; - typedef typename XprType::Index Index; - typedef ArgType ChildType; - typedef typename TensorEvaluator::Dimensions InputDimensions; - static const int NumInputDims = internal::array_size::value; - static const int NumReducedDims = internal::array_size::value; - static const int NumOutputDims = NumInputDims - NumReducedDims; - typedef typename internal::conditional, DSizes >::type Dimensions; - typedef typename XprType::Scalar Scalar; - typedef TensorEvaluator, Device> Self; - static const bool InputPacketAccess = TensorEvaluator::PacketAccess; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; - static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; - static const bool RunningFullReduction = (NumOutputDims==0); - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) - { - EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Build the bitmap indicating if an input dimension is reduced or not. 
- for (int i = 0; i < NumInputDims; ++i) { - m_reduced[i] = false; - } - for (int i = 0; i < NumReducedDims; ++i) { - eigen_assert(op.dims()[i] >= 0); - eigen_assert(op.dims()[i] < NumInputDims); - m_reduced[op.dims()[i]] = true; - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); - - // Precompute output strides. - if (NumOutputDims > 0) { - if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_outputStrides.back() = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } - } - } - - // Precompute input strides. - if (NumInputDims > 0) { - array input_strides; - if (static_cast(Layout) == static_cast(ColMajor)) { - input_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_strides[i] = input_strides[i-1] * input_dims[i-1]; - } - } else { - input_strides.back() = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; - } - } - - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - m_reducedStrides[reduceIndex] = input_strides[i]; - ++reduceIndex; - } else { - m_preservedStrides[outputIndex] = input_strides[i]; - ++outputIndex; - } - } - } - - // Special case for full reductions - if (NumOutputDims == 0) { - m_preservedStrides[0] = internal::array_prod(input_dims); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { - m_impl.evalSubExprsIfNeeded(NULL); - - // Use the FullReducer if possible. 
- if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && - internal::FullReducer::HasOptimizedImplementation && - ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - !RunningOnGPU))) { - bool need_assign = false; - if (!data) { - m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); - data = m_result; - need_assign = true; - } - Op reducer(m_reducer); - internal::FullReducer::run(*this, reducer, m_device, data); - return need_assign; - } - else if(RunningOnSycl){ - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } - - // Attempt to use an optimized reduction. - else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { - bool reducing_inner_dims = true; - for (int i = 0; i < NumReducedDims; ++i) { - if (static_cast(Layout) == static_cast(ColMajor)) { - reducing_inner_dims &= m_reduced[i]; - } else { - reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; - } - } - if (internal::InnerReducer::HasOptimizedImplementation && - (reducing_inner_dims || ReducingInnerMostDims)) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - else { - return true; - } - } - Op reducer(m_reducer); - if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { - if 
(m_result) { - m_device.deallocate(m_result); - m_result = NULL; - } - return true; - } else { - return (m_result != NULL); - } - } - - bool preserving_inner_dims = true; - for (int i = 0; i < NumReducedDims; ++i) { - if (static_cast(Layout) == static_cast(ColMajor)) { - preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; - } else { - preserving_inner_dims &= m_reduced[i]; - } - } - if (internal::OuterReducer::HasOptimizedImplementation && - preserving_inner_dims) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - else { - return true; - } - } - Op reducer(m_reducer); - if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { - if (m_result) { - m_device.deallocate(m_result); - m_result = NULL; - } - return true; - } else { - return (m_result != NULL); - } - } - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - if (m_result) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { - return *(m_result + index); - } - Op reducer(m_reducer); - if (ReducingInnerMostDims || RunningFullReduction) { - const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? 
m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; - return internal::InnerMostDimReducer::reduce(*this, firstInput(index), - num_values_to_reduce, reducer); - } else { - typename Self::CoeffReturnType accum = reducer.initialize(); - internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); - return reducer.finalize(accum); - } - } - - // TODO(bsteiner): provide a more efficient implementation. - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); - - if (RunningOnGPU && m_result) { - return internal::pload(m_result + index); - } - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - if (ReducingInnerMostDims) { - const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; - const Index firstIndex = firstInput(index); - for (Index i = 0; i < PacketSize; ++i) { - Op reducer(m_reducer); - values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, - num_values_to_reduce, reducer); - } - } else if (PreservingInnerMostDims) { - const Index firstIndex = firstInput(index); - const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; - // TBD: extend this the the n innermost dimensions that we preserve. 
- if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { - Op reducer(m_reducer); - typename Self::PacketReturnType accum = reducer.template initializePacket(); - internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); - return reducer.finalizePacket(accum); - } else { - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - } - } else { - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - // Must be called after evalSubExprsIfNeeded(). - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - if (RunningFullReduction && m_result) { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } else { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; - return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - } - - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} - - - private: - template friend struct internal::GenericDimReducer; - template friend struct internal::InnerMostDimReducer; - template friend struct internal::InnerMostDimPreserver; - template friend struct internal::FullReducer; -#ifdef EIGEN_USE_THREADS - template friend struct internal::FullReducerShard; -#endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template 
friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); -#endif - template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -#endif - - template friend struct internal::InnerReducer; - - // Returns the Index in the input tensor of the first value that needs to be - // used to compute the reduction at output index "index". - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - if (ReducingInnerMostDims) { - if (static_cast(Layout) == static_cast(ColMajor)) { - return index * m_preservedStrides[0]; - } else { - return index * m_preservedStrides[NumPreservedStrides - 1]; - } - } - // TBD: optimize the case where we preserve the innermost dimensions. - Index startInput = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumOutputDims - 1; i > 0; --i) { - // This is index_i in the output tensor. - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[0] == 1); - startInput += index; - } else { - startInput += index * m_preservedStrides[0]; - } - } else { - for (int i = 0; i < NumOutputDims - 1; ++i) { - // This is index_i in the output tensor. 
- const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); - startInput += index; - } else { - startInput += index * m_preservedStrides[NumPreservedStrides - 1]; - } - } - return startInput; - } - - // Bitmap indicating if an input dimension is reduced or not. - array m_reduced; - // Dimensions of the output of the operation. - Dimensions m_dimensions; - // Precomputed strides for the output tensor. - array m_outputStrides; - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1::size; - array m_preservedStrides; - - // Subset of strides of the input tensor for the reduced dimensions. - // Indexed by reduced dimensions. - array m_reducedStrides; - // Size of the input dimensions that are reduced. - // Indexed by reduced dimensions. - array m_reducedDims; - - // Evaluator for the input expression. - TensorEvaluator m_impl; - - // Operation to apply for computing the reduction. 
- Op m_reducer; - - // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - static const bool RunningOnGPU = internal::is_same::value; - static const bool RunningOnSycl = false; -#elif defined(EIGEN_USE_SYCL) -static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; -static const bool RunningOnGPU = false; -#else - static const bool RunningOnGPU = false; - static const bool RunningOnSycl = false; -#endif - typename MakePointer_::Type m_result; - - const Device& m_device; - const Dims& m_xpr_dims; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h deleted file mode 100644 index 65638b6a..00000000 --- a/externals/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ /dev/null @@ -1,750 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H - -namespace Eigen { -namespace internal { - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple cuda thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread -// updated the content of the output address it will try again. 
-template -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - assert(0 && "Wordsize not supported"); - } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif -} - -// We extend atomicExch to support extra data types -template -__device__ inline Type atomicExchCustom(Type* address, Type val) { - return atomicExch(address, val); -} - -template <> -__device__ inline double atomicExchCustom(double* address, double val) { - unsigned long long int* address_as_ull = reinterpret_cast(address); - return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); -} - -#ifdef EIGEN_HAS_CUDA_FP16 -template