Merge pull request #448 from ValeevGroup/evaleev/feature/mkl-fair-dis…

…patch allows to use fair dispatch in Intel MKL
ValeevGroup · Apr 2, 2024 · 138af30 · 138af30
2 parents 0cc5e31 + 0dbd0ee
commit 138af30
Show file tree

Hide file tree

Showing 10 changed files with 266 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -175,6 +175,9 @@ add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library")
 option(TA_TTG "Enable search/build of TTG library" OFF)
 add_feature_info(TA_TTG TA_TTG "TTG library")
 
+option(IntelMKL_FAIR_DISPATCH "Enable fair dispatch in Intel MKL" OFF)
+add_feature_info(IntelMKL_FAIR_DISPATCH IntelMKL_FAIR_DISPATCH "Use of fair dispatch in Intel MKL")
+
 # Enable shared library support options
 redefaultable_option(TA_ASSUMES_ASLR_DISABLED "TiledArray assumes the Address Space Layout Randomization (ASLR) to be disabled" OFF)
 add_feature_info(ASSUMES_ASLR_DISABLED TA_ASSUMES_ASLR_DISABLED

diff --git a/INSTALL.md b/INSTALL.md
@@ -423,6 +423,7 @@ support may be added.
 * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout.
 * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray.
 * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s.
+* `IntelMKL_FAIR_DISPATCH` -- If you want to use the Intel MKL library on non-Intel (e.g., AMD) CPUs, set to `ON` to use fair kernel dispatch. [Default=OFF].
 
 # Build TiledArray
 

diff --git a/examples/dgemm/ta_blas.cpp b/examples/dgemm/ta_blas.cpp
@@ -69,7 +69,7 @@ int main(int argc, char** argv) {
   // Start clock
   const double wall_time_start = madness::wall_time();
 
-  // Do matrix multiplcation
+  // Do matrix multiplication
   // Note: If TiledArray has not been configured with blas, this will be an
   // eigen call.
   for (int i = 0; i < repeat; ++i) {

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -313,6 +313,16 @@ if( TARGET ttg-parsec )
   list(APPEND _TILEDARRAY_DEPENDENCIES ttg-parsec)
 endif()
 
+if (IntelMKL_FAIR_DISPATCH AND BLAS_IS_MKL)
+    message(STATUS "created tiledarray_mkl_dispatch")
+    add_library(tiledarray_mkl_dispatch OBJECT
+      TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
+      TiledArray/external/agnerfog/intel_mkl_feature_patch.c
+    )
+    # N.B. --allow-multiple-definition is a GNU linker extension; presumably needed because the patch objects redefine mkl_serv_* symbols that MKL also provides
+    list(APPEND _TILEDARRAY_DEPENDENCIES $<TARGET_OBJECTS:tiledarray_mkl_dispatch> -Wl,--allow-multiple-definition)
+endif()
+
 # cache deps as TILEDARRAY_PRIVATE_LINK_LIBRARIES
 set(TILEDARRAY_PRIVATE_LINK_LIBRARIES ${_TILEDARRAY_DEPENDENCIES} CACHE STRING "List of libraries on which TiledArray depends on")
 

diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in
@@ -113,6 +113,8 @@
 #endif  // !defined(TILEDARRAY_HAS_BTAS)
 #if defined(TILEDARRAY_HAS_BTAS) && defined(BTAS_HAS_INTEL_MKL)
 #  define TILEDARRAY_HAS_INTEL_MKL
+/* use fair dispatch in Intel MKL? */
+#cmakedefine IntelMKL_FAIR_DISPATCH
 #endif
 
 /* Add macro TILEDARRAY_FORCE_INLINE which does as the name implies. */

diff --git a/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c
@@ -0,0 +1,48 @@
+/***********************  intel_cpu_feature_patch.c  **************************
+ * Author:           Agner Fog
+ * Date created:     2014-07-30
+ * Last modified:    2019-12-29
+ * Source URL:       https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language:         C or C++
+ *
+ * Description:
+ * Patch for Intel compiler version 13.0 and later, including the general
+ * libraries, LIBM and SVML, but not MKL and VML.
+ *
+ * Example of how to patch Intel's CPU feature dispatcher in order to improve
+ * compatibility of generated code with non-Intel processors.
+ * In Windows: Use the static link libraries (*.lib), not the dynamic link
+ * libraries (*.DLL).
+ * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so).
+ *
+ * Include this code in your C or C++ program and call intel_cpu_patch();
+ * before any call to the library functions.
+ *
+ * Copyright (c) 2014-2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// link to Intel libraries
+extern int64_t __intel_cpu_feature_indicator;    // CPU feature bits
+extern int64_t __intel_cpu_feature_indicator_x;  // CPU feature bits
+void __intel_cpu_features_init();  // unfair dispatcher: checks CPU features for
+                                   // Intel CPU's only
+void __intel_cpu_features_init_x();  // fair dispatcher: checks CPU features
+                                     // without discriminating by CPU brand
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
+
+void intel_cpu_patch() {
+  // force a re-evaluation of the CPU features without discriminating by CPU
+  // brand
+  __intel_cpu_feature_indicator = 0;
+  __intel_cpu_feature_indicator_x = 0;
+  __intel_cpu_features_init_x();
+  __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x;
+}
diff --git a/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
@@ -0,0 +1,61 @@
+/***********************  intel_mkl_cpuid_patch.c  **************************
+ * Author:           Agner Fog
+ * Date created:     2019-12-29
+ * Source URL:       https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language:         C or C++
+ *
+ * Description:
+ * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except
+ * the Vector Math Library (VML).
+ *
+ * Example of how to override Intel's CPU feature dispatcher in order to improve
+ * compatibility of Intel function libraries with non-Intel processors.
+ *
+ * Include this code in your C or C++ program and make sure it is linked before
+ * any Intel libraries. You may need to include intel_mkl_feature_patch.c as
+ * well.
+ *
+ * Copyright (c) 2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// override MKL's Intel-CPU check: always report 1, regardless of CPU brand
+int mkl_serv_intel_cpu() { return 1; }
+
+// same override for the "_true" variant: always report 1
+int mkl_serv_intel_cpu_true() { return 1; }
+
+int mkl_serv_cpuhaspnr_true() { return 1; }
+
+int mkl_serv_cpuhaspnr() { return 1; }
+
+int mkl_serv_cpuhasnhm() { return 1; }
+
+int mkl_serv_cpuisbulldozer() { return 0; }
+
+int mkl_serv_cpuiszen() { return 0; }
+
+int mkl_serv_cpuisatomsse4_2() { return 0; }
+
+int mkl_serv_cpuisatomssse3() { return 0; }
+
+int mkl_serv_cpuisitbarcelona() { return 0; }
+
+int mkl_serv_cpuisskl() { return 0; }
+
+int mkl_serv_cpuisknm() { return 0; }
+
+int mkl_serv_cpuisclx() { return 0; }
+
+int mkl_serv_get_microarchitecture() {
+  // I don't know what this number means
+  return 33;
+}
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
diff --git a/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c
@@ -0,0 +1,49 @@
+/***********************  intel_mkl_feature_patch.c  **************************
+ * Author:           Agner Fog
+ * Date created:     2014-07-30
+ * Last modified:    2019-12-29
+ * Source URL:       https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language:         C or C++
+ *
+ * Description:
+ * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except
+ * the Vector Math Library (VML).
+ *
+ * Example of how to patch Intel's CPU feature dispatcher in order to improve
+ * compatibility of Intel function libraries with non-Intel processors.
+ * In Windows: Use the static link libraries (*.lib), not the dynamic link
+ * libraries (*.DLL).
+ * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so).
+ *
+ * Include this code in your C or C++ program and call intel_mkl_patch();
+ * before any call to the MKL functions. You may need to include
+ * intel_mkl_cpuid_patch.c as well.
+ *
+ * Copyright (c) 2014-2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// link to MKL libraries
+extern int64_t __intel_mkl_feature_indicator;    // CPU feature bits
+extern int64_t __intel_mkl_feature_indicator_x;  // CPU feature bits
+void __intel_mkl_features_init();  // unfair dispatcher: checks CPU features for
+                                   // Intel CPU's only
+void __intel_mkl_features_init_x();  // fair dispatcher: checks CPU features
+                                     // without discriminating by CPU brand
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
+
+void intel_mkl_use_fair_dispatch() {
+  // force a re-evaluation of the CPU features without discriminating by CPU
+  // brand
+  __intel_mkl_feature_indicator = 0;
+  __intel_mkl_feature_indicator_x = 0;
+  __intel_mkl_features_init_x();
+  __intel_mkl_feature_indicator = __intel_mkl_feature_indicator_x;
+}
diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt
@@ -0,0 +1,84 @@
+               intel_dispatch_patch.zip
+               ========================
+
+By Agner Fog, Technical University of Denmark, 2019.
+
+Intel's compilers are generating code that will run slower than necessary when
+the code is executed on a CPU that is not produced by Intel. This has been
+observed with Intel C, C++, and Fortran compilers.
+
+The same happens when certain function libraries produced by Intel are used,
+even if the code is compiled with another compiler, such as Microsoft, Gnu
+or Clang compilers.
+
+This problem is affecting several commonly used software programs such as 
+Matlab, because they are using Intel software libraries.
+
+The library code and the code generated by an Intel compiler may contain
+multiple versions, each optimized for a particular instruction set extension.
+A so-called CPU dispatcher is choosing the optimal version of the code at
+runtime, based on which CPU it is running on.
+
+CPU dispatchers can be fair or unfair. A fair CPU dispatcher is choosing the
+optimal code based only on which instruction set extensions are supported
+by the CPU. An unfair dispatcher first checks the CPU brand. If the brand
+is not Intel, then the unfair dispatcher will choose the "generic" version
+of the code, i.e. the slowest version that is compatible with old CPUs 
+without the relevant instruction set extensions.
+
+The CPU dispatchers in many Intel function libraries have two versions, a 
+fair and an unfair one. It is not clear when the fair dispatcher is used
+and when the unfair dispatcher is used. My observations about fair and
+unfair CPU dispatching are as follows:
+
+* Code compiled with an Intel compiler will usually have unfair CPU dispatching.
+
+* The SVML (Short Vector Math Library) and IPP (Intel Performance Primitives)
+  function libraries from Intel are using the fair CPU dispatcher when used 
+  with a non-Intel compiler.
+
+* The MKL (Math Kernel Library) library contains both fair and unfair
+  dispatchers. It is not clear which dispatcher is used on each function.
+
+The code examples contained herein may be used for circumventing unfair CPU
+dispatching in order to improve compatibility with non-Intel CPUs.
+
+The following files are contained:
+
+intel_cpu_feature_patch.c
+-------------------------
+This code makes sure the fair dispatcher is called instead of the unfair
+one for code generated with an Intel compiler and for general Intel
+function libraries.
+
+intel_mkl_feature_patch.c
+-------------------------
+This does the same for the Intel MKL library.
+
+intel_mkl_cpuid_patch.c
+-----------------------
+This code example is overriding CPU detection functions in Intel's MKL 
+function library. The mkl_serv_intel_cpu() function in MKL is returning
+1 when running on an Intel CPU and 0 when running on any other brand of
+CPU. You may include this code to replace this function in MKL with a
+function that returns 1 regardless of CPU brand.
+
+It may be necessary to use both intel_mkl_feature_patch.c and 
+intel_mkl_cpuid_patch.c when using the MKL library in software that
+may run on any brand of CPU.
+
+An alternative method is to set the environment variable
+   MKL_DEBUG_CPU_TYPE=5
+when running on an AMD processor. This may be useful when you do not have
+access to the source code, for example when running Matlab software.
+
+The patches provided here are based on undocumented features in Intel
+function libraries. Use them at your own risk, and make sure to test your
+code properly to make sure it works as intended.
+
+The most reliable solution is, of course, to avoid Intel compilers and 
+Intel function libraries in code that may run on other CPU brands such
+as AMD and VIA. You may find other function libraries on the web, or 
+you may make your own functions. My vector class library (VCL) is useful
+for making mathematical functions that process multiple data in parallel,
+using the vector processing features of modern CPUs.
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp
@@ -16,6 +16,10 @@
 #include <ttg.h>
 #endif
 
+#ifdef IntelMKL_FAIR_DISPATCH
+extern "C" void intel_mkl_use_fair_dispatch();
+#endif
+
 #include <cerrno>
 #include <csignal>
 #include <cstdlib>
@@ -100,6 +104,9 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv,
     TiledArray::set_default_world(default_world);
 #ifdef TILEDARRAY_HAS_DEVICE
     TiledArray::device_initialize();
+#endif
+#ifdef IntelMKL_FAIR_DISPATCH
+    intel_mkl_use_fair_dispatch();
 #endif
     TiledArray::max_threads = TiledArray::get_num_threads();
     TiledArray::set_num_threads(1);