From 92c6981861f955d7d5cbe2e07d23693e3d8a1bbe Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 09:19:23 -0500
Subject: [PATCH 01/20] [unit] fix range1_suite/constructors + add corner case
 to range1_suite/accessors

---
 tests/range1.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tests/range1.cpp b/tests/range1.cpp
index bc2fabdd6c..ba49515cd7 100644
--- a/tests/range1.cpp
+++ b/tests/range1.cpp
@@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(constructors) {
   BOOST_CHECK_NO_THROW((Range1{1, 1}));
   BOOST_CHECK_NO_THROW(Range1(1, 1));
   BOOST_CHECK_EQUAL(Range1(1, 1).first, 1);
-  BOOST_CHECK_EQUAL(Range1(1, 1).first, 1);
+  BOOST_CHECK_EQUAL(Range1(1, 1).second, 1);
 
   BOOST_CHECK_NO_THROW((Range1{-11, 13}));
   BOOST_CHECK_EQUAL(Range1(-11, 13).first, -11);
@@ -86,6 +86,15 @@ BOOST_AUTO_TEST_CASE(accessors) {
   BOOST_CHECK_EQUAL(r.upbound(), 10);
   BOOST_CHECK_NO_THROW(r.extent());
   BOOST_CHECK_EQUAL(r.extent(), 9);
+
+  // corner case: empty range
+  Range1 r1{1, 1};
+  BOOST_CHECK_NO_THROW(r1.lobound());
+  BOOST_CHECK_EQUAL(r1.lobound(), 1);
+  BOOST_CHECK_NO_THROW(r1.upbound());
+  BOOST_CHECK_EQUAL(r1.upbound(), 1);
+  BOOST_CHECK_NO_THROW(r.extent());
+  BOOST_CHECK_EQUAL(r1.extent(), 0);
 }
 
 BOOST_AUTO_TEST_CASE(iteration) {
@@ -134,13 +143,13 @@ BOOST_AUTO_TEST_CASE(serialization) {
   std::size_t buf_size = sizeof(Range1);
   unsigned char* buf = new unsigned char[buf_size];
   madness::archive::BufferOutputArchive oar(buf, buf_size);
-  oar& r;
+  oar & r;
   std::size_t nbyte = oar.size();
   oar.close();
 
   Range1 rs;
   madness::archive::BufferInputArchive iar(buf, nbyte);
-  iar& rs;
+  iar & rs;
   iar.close();
 
   delete[] buf;

From f82330fb1a713fa9ac45e5451d4d3badca28d824 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 09:57:25 -0500
Subject: [PATCH 02/20] TiledRange1 supports empty tiles

---
 src/TiledArray/tiled_range1.h | 20 +++++++++++-------
 tests/tiled_range1.cpp        | 40 +++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h
index f1dc2369a8..69e5a5eea3 100644
--- a/src/TiledArray/tiled_range1.h
+++ b/src/TiledArray/tiled_range1.h
@@ -38,9 +38,11 @@ namespace TiledArray {
 
 /// TiledRange1 class defines a non-uniformly-tiled, contiguous, one-dimensional
 /// range. The tiling data is constructed with and stored in an array with
-/// the format {a0, a1, a2, ...}, where 0 <= a0 < a1 < a2 < ... Each tile is
+/// the format {a0, a1, a2, ...}, where a0 <= a1 <= a2 <= ... Each tile is
 /// defined as [a0,a1), [a1,a2), ... The number of tiles in the range will be
 /// equal to one less than the number of elements in the array.
+/// \note if TiledArray was configured with `TA_SIGNED_1INDEX_TYPE=OFF` then the
+/// tile boundaries must be non-negative.
 class TiledRange1 {
  private:
   struct Enabler {};
@@ -230,6 +232,7 @@ class TiledRange1 {
     if (!elem2tile_) {
       init_elem2tile_();
     }
+    // N.B. only track elements in this range
     return elem2tile_[i - elements_range_.first];
   }
 
@@ -312,17 +315,17 @@ class TiledRange1 {
     TA_ASSERT((std::distance(first, last) >= 2) &&
               "TiledRange1 construction failed: You need at least 2 "
               "elements in the tile boundary list.");
-    // Verify the requirement that a0 < a1 < a2 < ...
+    // Verify the requirement that a0 <= a1 <= a2 <= ...
     for (; first != (last - 1); ++first) {
       TA_ASSERT(
-          *first < *(first + 1) &&
+          *first <= *(first + 1) &&
           "TiledRange1 construction failed: Invalid tile boundary, tile "
-          "boundary i must be greater than tile boundary i+1 for all i. ");
+          "boundary i must not be greater than tile boundary i+1 for all i. ");
       TA_ASSERT(
-          static_cast<index1_type>(*first) <
+          static_cast<index1_type>(*first) <=
               static_cast<index1_type>(*(first + 1)) &&
           "TiledRange1 construction failed: Invalid tile boundary, tile "
-          "boundary i must be greater than tile boundary i+1 for all i. ");
+          "boundary i must not be greater than tile boundary i+1 for all i. ");
     }
   }
 
@@ -362,9 +365,10 @@ class TiledRange1 {
         // #endif
         const auto end = extent(range_);
         for (index1_type t = 0; t < end; ++t)
-          for (index1_type e = tiles_ranges_[t].first;
-               e < tiles_ranges_[t].second; ++e)
+          for (auto e : tiles_ranges_[t]) {
+            // only track elements in this range
             e2t[e - elements_range_.first] = t + range_.first;
+          }
         auto e2t_const = std::const_pointer_cast<const index1_type[]>(e2t);
         // commit the changes
         std::swap(elem2tile_, e2t_const);
diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp
index 043e4b96ac..eb94091e59 100644
--- a/tests/tiled_range1.cpp
+++ b/tests/tiled_range1.cpp
@@ -152,6 +152,32 @@ BOOST_AUTO_TEST_CASE(constructor) {
     }
   }
 
+  // corner cases
+  {
+    // range with 1 empty tile
+    {
+      TiledRange1 r{0, 0};
+      BOOST_CHECK_EQUAL(r.tiles_range().first, 0);
+      BOOST_CHECK_EQUAL(r.tiles_range().second, 1);
+      BOOST_CHECK_EQUAL(r.elements_range().first, 0);
+      BOOST_CHECK_EQUAL(r.elements_range().second, 0);
+      BOOST_CHECK(r.tile(0) == Range1(0, 0));
+    }
+    // range with some empty tiles
+    {
+      TiledRange1 r{1, 3, 3, 5, 5};
+      BOOST_CHECK_EQUAL(r.tiles_range().first, 0);
+      BOOST_CHECK_EQUAL(r.tiles_range().second, 4);
+      BOOST_CHECK_EQUAL(r.elements_range().first, 1);
+      BOOST_CHECK_EQUAL(r.elements_range().second, 5);
+      // test tiles
+      BOOST_CHECK(r.tile(0) == Range1(1, 3));
+      BOOST_CHECK(r.tile(1) == Range1(3, 3));
+      BOOST_CHECK(r.tile(2) == Range1(3, 5));
+      BOOST_CHECK(r.tile(3) == Range1(5, 5));
+    }
+  }
+
   // Check that invalid input throws an exception.
 #ifndef NDEBUG
   {
@@ -195,6 +221,20 @@ BOOST_AUTO_TEST_CASE(element_to_tile) {
 
   // Check that the expected and internal element to tile maps match.
   BOOST_CHECK_EQUAL_COLLECTIONS(c.begin(), c.end(), e.begin(), e.end());
+
+  // corner case: empty tiles
+  {
+    // range with some empty tiles
+    {
+      TiledRange1 r{1, 3, 3, 5, 5};
+      BOOST_CHECK_TA_ASSERT(r.element_to_tile(0), Exception);
+      BOOST_CHECK_EQUAL(r.element_to_tile(1), 0);
+      BOOST_CHECK_EQUAL(r.element_to_tile(2), 0);
+      BOOST_CHECK_EQUAL(r.element_to_tile(3), 2);
+      BOOST_CHECK_EQUAL(r.element_to_tile(4), 2);
+      BOOST_CHECK_TA_ASSERT(r.element_to_tile(5), Exception);
+    }
+  }
 }
 
 BOOST_AUTO_TEST_CASE(comparison) {

From 03b67de05e2062d1fa80ac0a5012899306f29e4a Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 10:03:55 -0500
Subject: [PATCH 03/20] [unit] tiled_range_suite/constructor: test corner case
 with TiledRange1 composed of empty tiles only

---
 tests/tiled_range.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp
index 14e47e3557..76702831a3 100644
--- a/tests/tiled_range.cpp
+++ b/tests/tiled_range.cpp
@@ -58,6 +58,14 @@ BOOST_AUTO_TEST_CASE(constructor) {
     BOOST_CHECK_EQUAL(r1.elements_range().area(), 0);
   }
 
+  // construct with ranges containing empty tiles only
+  {
+    BOOST_REQUIRE_NO_THROW(TiledRange r1({dims[0], TiledRange1{1, 1, 1}}));
+    TiledRange r1{dims[0], TiledRange1{1, 1, 1}};
+    BOOST_CHECK_EQUAL(r1.tiles_range().area(), dims[0].tile_extent() * 2);
+    BOOST_CHECK_EQUAL(r1.elements_range().area(), 0);
+  }
+
   // check initializer list of initializer list constructor
   {
     TiledRange r1{

From c3f8b80b7d5c44b3a0b17915c2d8257177fcdd19 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 10:05:13 -0500
Subject: [PATCH 04/20] dox typo

---
 src/TiledArray/expressions/cont_engine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h
index 2a658dc886..8795e699c6 100644
--- a/src/TiledArray/expressions/cont_engine.h
+++ b/src/TiledArray/expressions/cont_engine.h
@@ -350,7 +350,7 @@ class ContEngine : public BinaryEngine<Derived> {
     left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_));
     right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_));
 
-    // Initialize the process map in not already defined
+    // Initialize the process map if not already defined
     if (!pmap) pmap = proc_grid_.make_pmap();
     ExprEngine_::init_distribution(world, pmap);
   }

From a586733320f9ad216ef67cbe9b913153df5683df Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 14:06:58 -0500
Subject: [PATCH 05/20] SparseShape::sparsity() returns 0 for zero-volume
 ranges

---
 src/TiledArray/sparse_shape.h |  19 +--
 tests/sparse_shape.cpp        | 222 +++++++++++++++++++++++-----------
 2 files changed, 161 insertions(+), 80 deletions(-)

diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h
index b589dc73cf..f7cc9355f7 100644
--- a/src/TiledArray/sparse_shape.h
+++ b/src/TiledArray/sparse_shape.h
@@ -516,10 +516,13 @@ class SparseShape {
 
   /// Sparsity of the shape
 
-  /// \return The fraction of tiles that are zero.
+  /// \return The fraction of tiles that are zero. Always returns 0 if
+  /// `this->data().size()` is zero.
   float sparsity() const {
     TA_ASSERT(!tile_norms_.empty());
-    return float(zero_tile_count_) / float(tile_norms_.size());
+    return tile_norms_.size() != 0
+               ? float(zero_tile_count_) / float(tile_norms_.size())
+               : 0.f;
   }
 
   // clang-format off
@@ -1679,23 +1682,23 @@ class SparseShape {
             typename std::enable_if<madness::is_input_archive_v<
                 std::decay_t<Archive>>>::type* = nullptr>
   void serialize(Archive& ar) {
-    ar& tile_norms_;
+    ar & tile_norms_;
     const unsigned int dim = tile_norms_.range().rank();
     // allocate size_vectors_
     size_vectors_ = std::move(std::shared_ptr<vector_type>(
         new vector_type[dim], std::default_delete<vector_type[]>()));
-    for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d];
-    ar& zero_tile_count_;
+    for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d];
+    ar & zero_tile_count_;
   }
 
   template <typename Archive,
             typename std::enable_if<madness::is_output_archive_v<
                 std::decay_t<Archive>>>::type* = nullptr>
   void serialize(Archive& ar) const {
-    ar& tile_norms_;
+    ar & tile_norms_;
     const unsigned int dim = tile_norms_.range().rank();
-    for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d];
-    ar& zero_tile_count_;
+    for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d];
+    ar & zero_tile_count_;
   }
 
  private:
diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp
index 0112f0dac6..64116dd687 100644
--- a/tests/sparse_shape.cpp
+++ b/tests/sparse_shape.cpp
@@ -121,9 +121,12 @@ BOOST_AUTO_TEST_CASE(non_comm_constructor) {
     }
   }
 
-  BOOST_CHECK_CLOSE(x.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      x.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
   BOOST_CHECK(x.nnz() == x.data().size() - zero_tile_count);
 
   // use the sparse ctor
@@ -194,9 +197,12 @@ BOOST_AUTO_TEST_CASE(comm_constructor) {
     }
   }
 
-  BOOST_CHECK_CLOSE(x.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      x.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
   BOOST_CHECK_EQUAL(x.nnz(), x.data().size() - zero_tile_count);
 
   // use the sparse ctor
@@ -321,7 +327,9 @@ BOOST_AUTO_TEST_CASE(block) {
         }
         BOOST_CHECK_CLOSE(
             result.sparsity(),
-            float(zero_tile_count) / float(result.data().range().volume()),
+            result.data().range().volume() > 0
+                ? float(zero_tile_count) / float(result.data().range().volume())
+                : 0,
             tolerance);
 
         // validate other block functions
@@ -413,7 +421,9 @@ BOOST_AUTO_TEST_CASE(block_scale) {
         }
         BOOST_CHECK_CLOSE(
             result.sparsity(),
-            float(zero_tile_count) / float(result.data().range().volume()),
+            result.data().range().volume() > 0
+                ? float(zero_tile_count) / float(result.data().range().volume())
+                : 0,
             tolerance);
 
         // validate other block functions
@@ -513,7 +523,9 @@ BOOST_AUTO_TEST_CASE(block_perm) {
         }
         BOOST_CHECK_CLOSE(
             result.sparsity(),
-            float(zero_tile_count) / float(result.data().range().volume()),
+            result.data().range().volume() > 0
+                ? float(zero_tile_count) / float(result.data().range().volume())
+                : 0,
             tolerance);
 
         // validate other block functions
@@ -614,7 +626,9 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) {
         }
         BOOST_CHECK_CLOSE(
             result.sparsity(),
-            float(zero_tile_count) / float(result.data().range().volume()),
+            result.data().range().volume() > 0
+                ? float(zero_tile_count) / float(result.data().range().volume())
+                : 0,
             tolerance);
 
         // validate other block functions
@@ -706,9 +720,12 @@ BOOST_AUTO_TEST_CASE(transform) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(mask) {
@@ -745,9 +762,12 @@ BOOST_AUTO_TEST_CASE(mask) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(scale) {
@@ -778,9 +798,12 @@ BOOST_AUTO_TEST_CASE(scale) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(scale_perm) {
@@ -812,9 +835,12 @@ BOOST_AUTO_TEST_CASE(scale_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(add) {
@@ -848,9 +874,12 @@ BOOST_AUTO_TEST_CASE(add) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
   BOOST_CHECK_EQUAL(result.nnz(), result.data().size() - zero_tile_count);
 }
 
@@ -885,9 +914,12 @@ BOOST_AUTO_TEST_CASE(add_scale) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(add_perm) {
@@ -922,9 +954,12 @@ BOOST_AUTO_TEST_CASE(add_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(add_scale_perm) {
@@ -959,9 +994,12 @@ BOOST_AUTO_TEST_CASE(add_scale_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(add_const) {
@@ -998,9 +1036,12 @@ BOOST_AUTO_TEST_CASE(add_const) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(add_const_perm) {
@@ -1037,9 +1078,12 @@ BOOST_AUTO_TEST_CASE(add_const_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(subt) {
@@ -1073,9 +1117,12 @@ BOOST_AUTO_TEST_CASE(subt) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(subt_scale) {
@@ -1109,9 +1156,12 @@ BOOST_AUTO_TEST_CASE(subt_scale) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(subt_perm) {
@@ -1146,9 +1196,12 @@ BOOST_AUTO_TEST_CASE(subt_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(subt_scale_perm) {
@@ -1183,9 +1236,12 @@ BOOST_AUTO_TEST_CASE(subt_scale_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(subt_const) {
@@ -1220,9 +1276,12 @@ BOOST_AUTO_TEST_CASE(subt_const) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(subt_const_perm) {
@@ -1260,9 +1319,12 @@ BOOST_AUTO_TEST_CASE(subt_const_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(mult) {
@@ -1295,9 +1357,12 @@ BOOST_AUTO_TEST_CASE(mult) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(mult_scale) {
@@ -1330,9 +1395,12 @@ BOOST_AUTO_TEST_CASE(mult_scale) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(mult_perm) {
@@ -1368,9 +1436,12 @@ BOOST_AUTO_TEST_CASE(mult_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(mult_scale_perm) {
@@ -1406,9 +1477,12 @@ BOOST_AUTO_TEST_CASE(mult_scale_perm) {
     }
   }
 
-  BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(tr.tiles_range().volume()),
-                    tolerance);
+  BOOST_CHECK_CLOSE(
+      result.sparsity(),
+      tr.tiles_range().volume() > 0
+          ? float(zero_tile_count) / float(tr.tiles_range().volume())
+          : 0,
+      tolerance);
 }
 
 BOOST_AUTO_TEST_CASE(gemm) {
@@ -1470,7 +1544,9 @@ BOOST_AUTO_TEST_CASE(gemm) {
   }
 
   BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(result_norms.size()),
+                    result_norms.size() > 0
+                        ? float(zero_tile_count) / float(result_norms.size())
+                        : 0,
                     tolerance);
 }
 
@@ -1538,7 +1614,9 @@ BOOST_AUTO_TEST_CASE(gemm_perm) {
   }
 
   BOOST_CHECK_CLOSE(result.sparsity(),
-                    float(zero_tile_count) / float(result_norms.size()),
+                    result_norms.size() > 0
+                        ? float(zero_tile_count) / float(result_norms.size())
+                        : 0,
                     tolerance);
 }
 

From e05d908469e738469ff2209ad650e221010099a6 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 14:07:42 -0500
Subject: [PATCH 06/20] detail::permute works for zero-volume result/arg

---
 src/TiledArray/tensor/permute.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h
index 43fbfc9328..4d46907172 100644
--- a/src/TiledArray/tensor/permute.h
+++ b/src/TiledArray/tensor/permute.h
@@ -127,6 +127,9 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result,
   const unsigned int ndim1 = ndim - 1;
   const auto volume = arg0.range().volume();
 
+  // handle the corner case of empty result/args
+  if (volume == 0) return;
+
   // Get pointer to arg extent
   const auto* MADNESS_RESTRICT const arg0_extent = arg0.range().extent_data();
 

From ae41f770241a5e4a3a97b424c6b002b7e9128fdb Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 15:52:19 -0500
Subject: [PATCH 07/20] BlockRange extended for zero-volume Range

---
 src/TiledArray/block_range.h | 27 +++++++++++++++++----------
 tests/block_range.cpp        | 26 ++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/TiledArray/block_range.h b/src/TiledArray/block_range.h
index 08096c96ea..06f8ecd629 100644
--- a/src/TiledArray/block_range.h
+++ b/src/TiledArray/block_range.h
@@ -85,7 +85,7 @@ class BlockRange : public Range {
       upper[d] = upper_bound_d;
       // Check input dimensions
       TA_ASSERT(lower[d] >= range.lobound(d));
-      TA_ASSERT(lower[d] < upper[d]);
+      TA_ASSERT(lower[d] <= upper[d]);
       TA_ASSERT(upper[d] <= range.upbound(d));
       extent[d] = upper[d] - lower[d];
       TA_ASSERT(extent[d] ==
@@ -132,7 +132,7 @@ class BlockRange : public Range {
       upper[d] = upper_bound_d;
       // Check input dimensions
       TA_ASSERT(lower[d] >= range.lobound(d));
-      TA_ASSERT(lower[d] < upper[d]);
+      TA_ASSERT(lower[d] <= upper[d]);
       TA_ASSERT(upper[d] <= range.upbound(d));
       extent[d] = upper[d] - lower[d];
       TA_ASSERT(extent[d] ==
@@ -177,9 +177,10 @@ class BlockRange : public Range {
   /// \param range the host Range
   /// \param lower_bound A sequence of lower bounds for each dimension
   /// \param upper_bound A sequence of upper bounds for each dimension
+  /// \note Zero-extent blocks along any mode is possible, i.e. `lower_bound[d] == upper_bound[d]` is supported
   /// \throw TiledArray::Exception When the size of \p lower_bound is not
   /// equal to that of \p upper_bound.
-  /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]`
+  /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]`
   // clang-format on
   template <typename Index1, typename Index2,
             typename = std::enable_if_t<detail::is_integral_range_v<Index1> &&
@@ -204,9 +205,10 @@ class BlockRange : public Range {
   /// \param range the host Range
   /// \param lower_bound An initializer list of lower bounds for each dimension
   /// \param upper_bound An initializer list of upper bounds for each dimension
+  /// \note Zero-extent blocks along any mode is possible, i.e. `lower_bound[d] == upper_bound[d]` is supported
   /// \throw TiledArray::Exception When the size of \p lower_bound is not
   /// equal to that of \p upper_bound.
-  /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]`
+  /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]`
   // clang-format on
   template <typename Index1, typename Index2,
             typename = std::enable_if_t<std::is_integral_v<Index1> &&
@@ -247,7 +249,8 @@ class BlockRange : public Range {
   /// \endcode
   /// \tparam PairRange Type representing a range of generalized pairs (see TiledArray::detail::is_gpair_v )
   /// \param bounds A range of {lower,upper} bounds for each dimension
-  /// \throw TiledArray::Exception When `bounds[i].lower>=bounds[i].upper` for any \c i .
+  /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported
+  /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i .
   // clang-format on
   template <typename PairRange,
             typename = std::enable_if_t<detail::is_gpair_range_v<PairRange>>>
@@ -264,8 +267,9 @@ class BlockRange : public Range {
   ///   BlockRange br0(r, {std::make_pair(0,4), std::pair{1,6}, std::pair(2,8)});
   /// \endcode
   /// \tparam GPair a generalized pair of integral types
-  /// \param bound A range of {lower,upper} bounds for each dimension
-  /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i .
+  /// \param bounds A range of {lower,upper} bounds for each dimension
+  /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported
+  /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i .
   // clang-format on
   template <typename GPair>
   BlockRange(const Range& range, const std::initializer_list<GPair>& bounds,
@@ -290,8 +294,9 @@ class BlockRange : public Range {
   ///   BlockRange br0(r, {{0,4}, {1,6}, {2,8}});
   /// \endcode
   /// \tparam Index An integral type
-  /// \param bound A range of {lower,upper} bounds for each dimension
-  /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i .
+  /// \param bounds A range of {lower,upper} bounds for each dimension
+  /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported
+  /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i .
   // clang-format on
   template <typename Index,
             typename = std::enable_if_t<std::is_integral_v<Index>>>
@@ -354,6 +359,8 @@ class BlockRange : public Range {
   /// \return The ordinal index in the
   /// \throw TiledArray::Exception When \c index is not included in this range
   ordinal_type ordinal(ordinal_type ord) const {
+    // ordinals are useless for zero-volume ranges
+    TA_ASSERT(volume() != 0);
     // Check that ord is contained by this range.
     TA_ASSERT(Range::includes_ordinal(ord));
 
@@ -414,7 +421,7 @@ class BlockRange : public Range {
   template <typename Archive>
   void serialize(Archive& ar) const {
     Range::serialize(ar);
-    ar& block_offset_;
+    ar & block_offset_;
   }
 };  // BlockRange
 
diff --git a/tests/block_range.cpp b/tests/block_range.cpp
index 135c36d0b4..5d8431fa41 100644
--- a/tests/block_range.cpp
+++ b/tests/block_range.cpp
@@ -72,7 +72,7 @@ BOOST_AUTO_TEST_CASE(block_zero_lower_bound) {
       for (unsigned int i = 0u; i < upper.size(); ++i) ++(upper[i]);
 
       if (std::equal(lower.begin(), lower.end(), upper.begin(),
-                     [](std::size_t l, std::size_t r0) { return l < r0; })) {
+                     [](std::size_t l, std::size_t r0) { return l <= r0; })) {
         if (count_valid == target_count) continue;
         ++count_valid;
 
@@ -141,7 +141,7 @@ BOOST_AUTO_TEST_CASE(block) {
       for (unsigned int i = 0u; i < r.rank(); ++i) ++(upper[i]);
 
       if (std::equal(lower.begin(), lower.end(), upper.begin(),
-                     [](std::size_t l, std::size_t r) { return l < r; })) {
+                     [](std::size_t l, std::size_t r) { return l <= r; })) {
         if (count_valid == target_count) continue;
         ++count_valid;
 
@@ -269,4 +269,26 @@ BOOST_AUTO_TEST_CASE(block) {
 end:;
 }
 
+BOOST_AUTO_TEST_CASE(empty_trange1) {
+  using TiledArray::eigen::iv;
+  // host range is non-empty but one of the dimensions will have no tiles
+  {
+    BOOST_CHECK_NO_THROW(BlockRange(r, iv(3, 3, 3), iv(4, 3, 5)));
+    BlockRange br(r, iv(3, 3, 3), iv(4, 3, 5));
+    BOOST_CHECK_EQUAL(br.volume(), 0);
+    BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception);
+  }
+
+  // host range is non-empty but one of the dimensions will have no tiles
+  {
+    BOOST_CHECK_NO_THROW(
+        BlockRange(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0),
+                   iv(1, 0, 1)));
+    BlockRange br(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0),
+                  iv(1, 0, 1));
+    BOOST_CHECK_EQUAL(br.volume(), 0);
+    BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception);
+  }
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From 463f6a75254dcc325e9b0a37c21f84a3723dfd0f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 16:47:08 -0500
Subject: [PATCH 08/20] SparseShape::block() supports zero-volume blocks

---
 src/TiledArray/sparse_shape.h |  4 ++--
 tests/sparse_shape.cpp        | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h
index f7cc9355f7..a7df1c520c 100644
--- a/src/TiledArray/sparse_shape.h
+++ b/src/TiledArray/sparse_shape.h
@@ -840,7 +840,7 @@ class SparseShape {
 
       // Check that the input indices are in range
       TA_ASSERT(lower_d >= tile_norms_.range().lobound(d));
-      TA_ASSERT(lower_d < upper_d);
+      TA_ASSERT(lower_d <= upper_d);
       TA_ASSERT(upper_d <= tile_norms_.range().upbound(d));
 
       // Construct the size vector for rank i
@@ -874,7 +874,7 @@ class SparseShape {
 
       // Check that the input indices are in range
       TA_ASSERT(lower_d >= tile_norms_.range().lobound(d));
-      TA_ASSERT(lower_d < upper_d);
+      TA_ASSERT(lower_d <= upper_d);
       TA_ASSERT(upper_d <= tile_norms_.range().upbound(d));
 
       // Construct the size vector for rank i
diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp
index 64116dd687..a79d7ceb8e 100644
--- a/tests/sparse_shape.cpp
+++ b/tests/sparse_shape.cpp
@@ -276,7 +276,7 @@ BOOST_AUTO_TEST_CASE(block) {
   // change default threshold to make sure it's not inherited
   auto resetter = set_threshold_to_max();
 
-  auto less = std::less<std::size_t>();
+  auto less_equal = std::less_equal<std::size_t>();
 
   for (auto lower_it = tr.tiles_range().begin();
        lower_it != tr.tiles_range().end(); ++lower_it) {
@@ -287,7 +287,7 @@ BOOST_AUTO_TEST_CASE(block) {
       auto upper = *upper_it;
       for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1;
 
-      if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) {
+      if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) {
         // Check that the block function does not throw an exception
         SparseShape<float> result;
         BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper));
@@ -369,7 +369,7 @@ BOOST_AUTO_TEST_CASE(block_scale) {
   // change default threshold to make sure it's not inherited
   auto resetter = set_threshold_to_max();
 
-  auto less = std::less<std::size_t>();
+  auto less_equal = std::less_equal<std::size_t>();
   const float factor = 3.3;
 
   for (auto lower_it = tr.tiles_range().begin();
@@ -381,7 +381,7 @@ BOOST_AUTO_TEST_CASE(block_scale) {
       auto upper = *upper_it;
       for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1;
 
-      if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) {
+      if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) {
         // Check that the block function does not throw an exception
         SparseShape<float> result;
         BOOST_REQUIRE_NO_THROW(result =
@@ -468,7 +468,7 @@ BOOST_AUTO_TEST_CASE(block_perm) {
   // change default threshold to make sure it's not inherited
   auto resetter = set_threshold_to_max();
 
-  auto less = std::less<std::size_t>();
+  auto less_equal = std::less_equal<std::size_t>();
   const auto inv_perm = perm.inv();
 
   for (auto lower_it = tr.tiles_range().begin();
@@ -480,7 +480,7 @@ BOOST_AUTO_TEST_CASE(block_perm) {
       auto upper = *upper_it;
       for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1;
 
-      if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) {
+      if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) {
         // Check that the block function does not throw an exception
         SparseShape<float> result;
         BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper, perm));
@@ -569,7 +569,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) {
   // change default threshold to make sure it's not inherited
   auto resetter = set_threshold_to_max();
 
-  auto less = std::less<std::size_t>();
+  auto less_equal = std::less_equal<std::size_t>();
   const float factor = 3.3;
   const auto inv_perm = perm.inv();
 
@@ -582,7 +582,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) {
       auto upper = *upper_it;
       for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1;
 
-      if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) {
+      if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) {
         // Check that the block function does not throw an exception
         SparseShape<float> result;
         BOOST_REQUIRE_NO_THROW(

From a8d39125da85229a57444b62be9df0b7997c41e0 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 14:15:51 -0500
Subject: [PATCH 09/20] [unit] expressions suite: tensor_factories tests
 DistArrays with empty trange1

---
 src/TiledArray/expressions/blk_tsr_engine.h | 81 ++++++++++++++-------
 src/TiledArray/expressions/blk_tsr_expr.h   |  2 +-
 tests/expressions_fixture.h                 | 12 ++-
 tests/expressions_impl.h                    | 26 +++++++
 4 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h
index 2d16172dbe..31ad29ee74 100644
--- a/src/TiledArray/expressions/blk_tsr_engine.h
+++ b/src/TiledArray/expressions/blk_tsr_engine.h
@@ -194,16 +194,19 @@ class BlkTsrEngineBase : public LeafEngine<Derived> {
       const auto lower_d = lower[d];
       const auto upper_d = upper[d];
 
-      // Copy and shift the tiling for the block
-      auto i = lower_d;
-      const auto base_d = trange[d].tile(i).first;
-      trange1_data.emplace_back(0ul);
-      for (; i < upper_d; ++i)
-        trange1_data.emplace_back(trange[d].tile(i).second - base_d);
-
-      // Add the trange1 to the tiled range data
-      trange_data.emplace_back(trange1_data.begin(), trange1_data.end());
-      trange1_data.resize(0ul);
+      // Copy and shift the tiling for the block, if nonempty
+      if (lower_d != upper_d) {
+        auto i = lower_d;
+        const auto base_d = trange[d].tile(i).first;
+        trange1_data.emplace_back(0ul);
+        for (; i < upper_d; ++i)
+          trange1_data.emplace_back(trange[d].tile(i).second - base_d);
+        // Add the trange1 to the tiled range data
+        trange_data.emplace_back(trange1_data.begin(), trange1_data.end());
+        trange1_data.resize(0ul);
+      } else {
+        trange_data.emplace_back();
+      }
     }
 
     return TiledRange(trange_data.begin(), trange_data.end());
@@ -233,16 +236,19 @@ class BlkTsrEngineBase : public LeafEngine<Derived> {
       const auto lower_i = lower[inv_perm_d];
       const auto upper_i = upper[inv_perm_d];
 
-      // Copy, shift, and permute the tiling of the block
-      auto i = lower_i;
-      const auto base_d = trange[inv_perm_d].tile(i).first;
-      trange1_data.emplace_back(0ul);
-      for (; i < upper_i; ++i)
-        trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d);
-
-      // Add the trange1 to the tiled range data
-      trange_data.emplace_back(trange1_data.begin(), trange1_data.end());
-      trange1_data.resize(0ul);
+      if (lower_i != upper_i) {
+        // Copy, shift, and permute the tiling of the block
+        auto i = lower_i;
+        const auto base_d = trange[inv_perm_d].tile(i).first;
+        trange1_data.emplace_back(0ul);
+        for (; i < upper_i; ++i)
+          trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d);
+
+        // Add the trange1 to the tiled range data
+        trange_data.emplace_back(trange1_data.begin(), trange1_data.end());
+        trange1_data.resize(0ul);
+      } else
+        trange_data.emplace_back();
     }
 
     return TiledRange(trange_data.begin(), trange_data.end());
@@ -376,12 +382,18 @@ class BlkTsrEngine
     // Get temporary data pointers
     const auto* MADNESS_RESTRICT const trange = array_.trange().data().data();
     const auto* MADNESS_RESTRICT const lower = lower_bound_.data();
+    const auto* MADNESS_RESTRICT const upper = upper_bound_.data();
 
     // Initialize the range shift vector
     for (unsigned int d = 0u; d < rank; ++d) {
       const auto lower_d = lower[d];
-      const auto base_d = trange[d].tile(lower_d).first;
-      range_shift.emplace_back(-base_d);
+      const auto upper_d = upper[d];
+      if (lower_d != upper_d) {
+        const auto base_d = trange[d].tile(lower_d).first;
+        range_shift.emplace_back(-base_d);
+      } else {
+        range_shift.emplace_back(0l);
+      }
     }
 
     return op_type(op_base_type(range_shift));
@@ -402,6 +414,7 @@ class BlkTsrEngine
     // Get temporary data pointers
     const auto* MADNESS_RESTRICT const trange = array_.trange().data().data();
     const auto* MADNESS_RESTRICT const lower = lower_bound_.data();
+    const auto* MADNESS_RESTRICT const upper = upper_bound_.data();
 
     // Initialize the permuted range shift vector
     auto outer_perm = outer(perm);
@@ -409,8 +422,11 @@ class BlkTsrEngine
     for (unsigned int d = 0u; d < rank; ++d) {
       const auto perm_d = outer_perm[d];
       const auto lower_d = lower[d];
-      const auto base_d = trange[d].tile(lower_d).first;
-      range_shift[perm_d] = -base_d;
+      const auto upper_d = upper[d];
+      if (lower_d != upper_d) {
+        const auto base_d = trange[d].tile(lower_d).first;
+        range_shift[perm_d] = -base_d;
+      }
     }
 
     return op_type(op_base_type(range_shift), perm);
@@ -522,12 +538,17 @@ class ScalBlkTsrEngine
     // Get temporary data pointers
     const auto* MADNESS_RESTRICT const trange = array_.trange().data().data();
     const auto* MADNESS_RESTRICT const lower = lower_bound_.data();
+    const auto* MADNESS_RESTRICT const upper = upper_bound_.data();
 
     // Construct the inverse permutation
     for (unsigned int d = 0u; d < rank; ++d) {
       const auto lower_d = lower[d];
-      const auto base_d = trange[d].tile(lower_d).first;
-      range_shift.emplace_back(-base_d);
+      const auto upper_d = upper[d];
+      if (lower_d != upper_d) {
+        const auto base_d = trange[d].tile(lower_d).first;
+        range_shift.emplace_back(-base_d);
+      } else
+        range_shift.emplace_back(0);
     }
 
     return op_type(op_base_type(range_shift, factor_));
@@ -548,6 +569,7 @@ class ScalBlkTsrEngine
     // Get temporary data pointers
     const auto* MADNESS_RESTRICT const trange = array_.trange().data().data();
     const auto* MADNESS_RESTRICT const lower = lower_bound_.data();
+    const auto* MADNESS_RESTRICT const upper = upper_bound_.data();
 
     // Initialize the permuted range shift vector
     auto outer_perm = outer(perm);
@@ -555,8 +577,11 @@ class ScalBlkTsrEngine
     for (unsigned int d = 0u; d < rank; ++d) {
       const auto perm_d = outer_perm[d];
       const auto lower_d = lower[d];
-      const auto base_d = trange[d].tile(lower_d).first;
-      range_shift[perm_d] = -base_d;
+      const auto upper_d = upper[d];
+      if (lower_d != upper_d) {
+        const auto base_d = trange[d].tile(lower_d).first;
+        range_shift[perm_d] = -base_d;
+      }
     }
 
     return op_type(op_base_type(range_shift, factor_), perm);
diff --git a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h
index d32603b58f..5d6612d5cc 100644
--- a/src/TiledArray/expressions/blk_tsr_expr.h
+++ b/src/TiledArray/expressions/blk_tsr_expr.h
@@ -179,7 +179,7 @@ class BlkTsrExprBase : public Expr<Derived> {
     const bool lower_upper_bound_check =
         std::equal(std::begin(lower_bound_), std::end(lower_bound_),
                    std::begin(upper_bound_),
-                   [](std::size_t l, std::size_t r) { return l < r; });
+                   [](std::size_t l, std::size_t r) { return l <= r; });
     if (!lower_upper_bound_check) {
       if (TiledArray::get_default_world().rank() == 0) {
         using TiledArray::operator<<;
diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h
index 3e493b1200..a4f7e4cd0b 100644
--- a/tests/expressions_fixture.h
+++ b/tests/expressions_fixture.h
@@ -62,9 +62,11 @@ struct ExpressionsFixture : public TiledRangeFixture {
         s_tr1_1(make_random_sparseshape(trange1)),
         s_tr1_2(make_random_sparseshape(trange1)),
         s_tr2(make_random_sparseshape(trange2)),
+        s_trC(make_random_sparseshape(trangeC)),
         a(*GlobalFixture::world, tr, s_tr_1),
         b(*GlobalFixture::world, tr, s_tr_2),
         c(*GlobalFixture::world, tr, s_tr_2),
+        aC(*GlobalFixture::world, trangeC, s_trC),
         u(*GlobalFixture::world, trange1, s_tr1_1),
         v(*GlobalFixture::world, trange1, s_tr1_2),
         w(*GlobalFixture::world, trange2, s_tr2) {
@@ -72,6 +74,7 @@ struct ExpressionsFixture : public TiledRangeFixture {
     random_fill(b);
     random_fill(u);
     random_fill(v);
+    random_fill(aC);
     GlobalFixture::world->gop.fence();
     a.truncate();
     b.truncate();
@@ -88,11 +91,13 @@ struct ExpressionsFixture : public TiledRangeFixture {
         c(*GlobalFixture::world, tr),
         u(*GlobalFixture::world, trange1),
         v(*GlobalFixture::world, trange1),
-        w(*GlobalFixture::world, trange2) {
+        w(*GlobalFixture::world, trange2),
+        aC(*GlobalFixture::world, trangeC) {
     random_fill(a);
     random_fill(b);
     random_fill(u);
     random_fill(v);
+    random_fill(aC);
     GlobalFixture::world->gop.fence();
   }
 
@@ -213,17 +218,22 @@ struct ExpressionsFixture : public TiledRangeFixture {
   const TiledRange trange1{{0, 2, 5, 10, 17, 28, 41}};
   const TiledRange trange2{{0, 2, 5, 10, 17, 28, 41},
                            {0, 3, 6, 11, 18, 29, 42}};
+  // contains empty trange1
+  const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{},
+                           TiledRange1{0, 2, 7, 11}};
   SparseShape<float> s_tr_1;
   SparseShape<float> s_tr_2;
   SparseShape<float> s_tr1_1;
   SparseShape<float> s_tr1_2;
   SparseShape<float> s_tr2;
+  SparseShape<float> s_trC;
   TArray a;
   TArray b;
   TArray c;
   TArray u;
   TArray v;
   TArray w;
+  TArray aC;
 };  // ExpressionsFixture
 
 #endif  // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H
diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h
index 91bcb10cc4..6fdfc2ce0e 100644
--- a/tests/expressions_impl.h
+++ b/tests/expressions_impl.h
@@ -31,6 +31,7 @@ constexpr int nrepeats = 5;
 BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) {
   auto& a = F::a;
   auto& c = F::c;
+  auto& aC = F::aC;
 
   const auto& ca = a;
   const std::array<int, 3> lobound{{3, 3, 3}};
@@ -2941,6 +2942,31 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) {
   BOOST_CHECK_EQUAL(result, expected);
 }
 
+// corner case: expressions involving array with empty trange1
+BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) {
+  auto& c = F::c;
+  auto& aC = F::aC;
+
+  BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c"));
+  BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c"));
+  BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c"));
+  BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c"));
+  BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj());
+  BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b"));
+  BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj());
+  BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj());
+
+  using TiledArray::eigen::iv;
+  const std::array<int, 3> lobound{{0, 0, 1}};
+  const std::array<int, 3> upbound{{1, 0, 2}};
+
+  BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound));
+  BOOST_CHECK_NO_THROW(c("a,b,c") +=
+                       2 * aC("a,b,c").block(lobound, upbound).conj());
+  BOOST_CHECK_NO_THROW(c("a,b,c") =
+                           2 * conj(aC("a,c,b").block(lobound, upbound)));
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 #endif  // TILEDARRAY_TEST_EXPRESSIONS_IMPL_H

From df7e0c804dfa6f5901fa4e6bedbeb1993e2a5286 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 15 Feb 2024 23:46:31 -0500
Subject: [PATCH 10/20] GEMM ld{a,b,c} must be >= 1

see https://netlib.org/lapack/explore-html/dd/d09/group__gemm_ga1e899f8453bcbfde78e91a86a2dab984.html#ga1e899f8453bcbfde78e91a86a2dab984
---
 src/TiledArray/device/btas.h   | 26 ++++++++++++++++----------
 src/TiledArray/external/btas.h | 26 ++++++++++++++++----------
 src/TiledArray/tensor/tensor.h | 14 +++++++++-----
 3 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/src/TiledArray/device/btas.h b/src/TiledArray/device/btas.h
index acd42341fd..b30fdd4edd 100644
--- a/src/TiledArray/device/btas.h
+++ b/src/TiledArray/device/btas.h
@@ -77,10 +77,12 @@ ::btas::Tensor<T, Range, Storage> gemm(
   gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range());
 
   // Get the leading dimension for left and right matrices.
-  const integer lda =
-      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
-  const integer ldb =
-      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
+  const integer lda = std::max(
+      integer{1},
+      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m));
+  const integer ldb = std::max(
+      integer{1},
+      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k));
 
   T factor_t = T(factor);
   T zero(0);
@@ -112,10 +114,11 @@ ::btas::Tensor<T, Range, Storage> gemm(
 
     static_assert(::btas::boxrange_iteration_order<Range>::value ==
                   ::btas::boxrange_iteration_order<Range>::row_major);
+    const integer ldc = std::max(integer{1}, n);
     blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(),
                gemm_helper.left_op(), n, m, k, factor_t,
                device_data(right.storage()), ldb, device_data(left.storage()),
-               lda, zero, device_data(result.storage()), n, queue);
+               lda, zero, device_data(result.storage()), ldc, queue);
 
     device::sync_madness_task_with(stream);
   }
@@ -185,10 +188,12 @@ void gemm(::btas::Tensor<T, Range, Storage> &result,
   gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range());
 
   // Get the leading dimension for left and right matrices.
-  const integer lda =
-      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
-  const integer ldb =
-      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
+  const integer lda = std::max(
+      integer{1},
+      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m));
+  const integer ldb = std::max(
+      integer{1},
+      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k));
 
   auto &queue = blasqueue_for(result.range());
   const auto stream = device::Stream(queue.device(), queue.stream());
@@ -207,10 +212,11 @@ void gemm(::btas::Tensor<T, Range, Storage> &result,
 
     static_assert(::btas::boxrange_iteration_order<Range>::value ==
                   ::btas::boxrange_iteration_order<Range>::row_major);
+    const integer ldc = std::max(integer{1}, n);
     blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(),
                gemm_helper.left_op(), n, m, k, factor_t,
                device_data(right.storage()), ldb, device_data(left.storage()),
-               lda, one, device_data(result.storage()), n, queue);
+               lda, one, device_data(result.storage()), ldc, queue);
     device::sync_madness_task_with(stream);
   }
 }
diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h
index 11971c269e..fe84e6f0c6 100644
--- a/src/TiledArray/external/btas.h
+++ b/src/TiledArray/external/btas.h
@@ -661,16 +661,19 @@ inline btas::Tensor<T, Range, Storage> gemm(
   gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range());
 
   // Get the leading dimension for left and right matrices.
-  const integer lda =
-      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
-  const integer ldb =
-      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
+  const integer lda = std::max(
+      integer{1},
+      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m));
+  const integer ldb = std::max(
+      integer{1},
+      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k));
 
   T factor_t(factor);
 
+  const integer ldc = std::max(integer{1}, n);
   TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m,
                                n, k, factor_t, left.data(), lda, right.data(),
-                               ldb, T(0), result.data(), n);
+                               ldb, T(0), result.data(), ldc);
 
   return result;
 }
@@ -736,16 +739,19 @@ inline void gemm(btas::Tensor<T, Range, Storage>& result,
   gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range());
 
   // Get the leading dimension for left and right matrices.
-  const integer lda =
-      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m);
-  const integer ldb =
-      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k);
+  const integer lda = std::max(
+      integer{1},
+      (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m));
+  const integer ldb = std::max(
+      integer{1},
+      (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k));
 
   T factor_t(factor);
 
+  const integer ldc = std::max(integer{1}, n);
   TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m,
                                n, k, factor_t, left.data(), lda, right.data(),
-                               ldb, T(1), result.data(), n);
+                               ldb, T(1), result.data(), ldc);
 }
 
 // sum of the hyperdiagonal elements
diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h
index 9c36b071cc..c12c2c15d1 100644
--- a/src/TiledArray/tensor/tensor.h
+++ b/src/TiledArray/tensor/tensor.h
@@ -2648,10 +2648,13 @@ void gemm(Alpha alpha, const Tensor<As...>& A, const Tensor<Bs...>& B,
     gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range());
 
     // Get the leading dimension for left and right matrices.
-    const integer lda =
-        (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m);
-    const integer ldb =
-        (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n : k);
+    const integer lda = std::max(
+        integer{1},
+        (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m));
+    const integer ldb = std::max(
+        integer{1},
+        (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n
+                                                                       : k));
 
     // may need to split gemm into multiply + accumulate for tracing purposes
 #ifdef TA_ENABLE_TILE_OPS_LOGGING
@@ -2719,8 +2722,9 @@ void gemm(Alpha alpha, const Tensor<As...>& A, const Tensor<Bs...>& B,
       }
     }
 #else   // TA_ENABLE_TILE_OPS_LOGGING
+    const integer ldc = std::max(integer{1}, n);
     math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k,
-                     alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n);
+                     alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc);
 #endif  // TA_ENABLE_TILE_OPS_LOGGING
   }
 }

From e1883fecc61ecc9ff4cb341cc18d374cd8763a45 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 16 Feb 2024 00:01:53 -0500
Subject: [PATCH 11/20] [unit] expression suite: can contract + reduce
 zero-volume DistArrays

---
 src/TiledArray/dist_eval/contraction_eval.h |  8 ++--
 src/TiledArray/pmap/cyclic_pmap.h           |  4 --
 src/TiledArray/proc_grid.h                  |  6 ---
 tests/cyclic_pmap.cpp                       |  6 ---
 tests/expressions_impl.h                    | 50 +++++++++++++++------
 5 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h
index 2da66628fc..4755538689 100644
--- a/src/TiledArray/dist_eval/contraction_eval.h
+++ b/src/TiledArray/dist_eval/contraction_eval.h
@@ -891,9 +891,9 @@ class Summa
   ordinal_type initialize(const DenseShape&) {
     // Construct static broadcast groups for dense arguments
     const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
-    col_group_ = proc_grid_.make_col_group(col_did);
+    if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did);
     const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
-    row_group_ = proc_grid_.make_row_group(row_did);
+    if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did);
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
     std::stringstream ss;
@@ -1347,7 +1347,6 @@ class Summa
 
     template <typename Derived>
     void make_next_step_tasks(Derived* task, ordinal_type depth) {
-      TA_ASSERT(depth > 0);
       // Set the depth to be no greater than the maximum number steps
       if (depth > owner_->k_) depth = owner_->k_;
 
@@ -1706,6 +1705,9 @@ class Summa
           std::max(ProcGrid::size_type(2),
                    std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));
 
+      // corner case: empty result
+      if (k_ == 0) return 0;
+
       // Construct the first SUMMA iteration task
       if (TensorImpl_::shape().is_dense()) {
         // We cannot have more iterations than there are blocks in the k
diff --git a/src/TiledArray/pmap/cyclic_pmap.h b/src/TiledArray/pmap/cyclic_pmap.h
index 6d2df0088b..250b4f677b 100644
--- a/src/TiledArray/pmap/cyclic_pmap.h
+++ b/src/TiledArray/pmap/cyclic_pmap.h
@@ -84,10 +84,6 @@ class CyclicPmap : public Pmap {
         cols_(cols),
         proc_cols_(proc_cols),
         proc_rows_(proc_rows) {
-    // Check that the size is non-zero
-    TA_ASSERT(rows_ >= 1ul);
-    TA_ASSERT(cols_ >= 1ul);
-
     // Check limits of process rows and columns
     TA_ASSERT(proc_rows_ >= 1ul);
     TA_ASSERT(proc_cols_ >= 1ul);
diff --git a/src/TiledArray/proc_grid.h b/src/TiledArray/proc_grid.h
index a401e0ac1e..cd15c1b73e 100644
--- a/src/TiledArray/proc_grid.h
+++ b/src/TiledArray/proc_grid.h
@@ -288,12 +288,6 @@ class ProcGrid {
         local_rows_(0ul),
         local_cols_(0ul),
         local_size_(0ul) {
-    // Check for non-zero sizes
-    TA_ASSERT(rows_ >= 1u);
-    TA_ASSERT(cols_ >= 1u);
-    TA_ASSERT(row_size >= 1ul);
-    TA_ASSERT(col_size >= 1ul);
-
     init(world_->rank(), world_->size(), row_size, col_size);
   }
 
diff --git a/tests/cyclic_pmap.cpp b/tests/cyclic_pmap.cpp
index 4d8d76da1f..b8c2b9670c 100644
--- a/tests/cyclic_pmap.cpp
+++ b/tests/cyclic_pmap.cpp
@@ -60,12 +60,6 @@ BOOST_AUTO_TEST_CASE(constructor) {
 
   ProcessID size = GlobalFixture::world->size();
 
-  BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap(
-                            *GlobalFixture::world, 0ul, 10ul, 1, 1),
-                        TiledArray::Exception);
-  BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap(
-                            *GlobalFixture::world, 10ul, 0ul, 1, 1),
-                        TiledArray::Exception);
   BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap(
                             *GlobalFixture::world, 10ul, 10ul, 0, 1),
                         TiledArray::Exception);
diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h
index 6fdfc2ce0e..ea4beab3d6 100644
--- a/tests/expressions_impl.h
+++ b/tests/expressions_impl.h
@@ -2947,24 +2947,48 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) {
   auto& c = F::c;
   auto& aC = F::aC;
 
-  BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c"));
-  BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c"));
-  BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c"));
-  BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c"));
-  BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj());
-  BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b"));
-  BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj());
-  BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj());
+  // unary/binary expressions
+  {
+    BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c"));
+    BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c"));
+    BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c"));
+    BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c"));
+    BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj());
+    BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b"));
+    BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj());
+    BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj());
+  }
 
   using TiledArray::eigen::iv;
   const std::array<int, 3> lobound{{0, 0, 1}};
   const std::array<int, 3> upbound{{1, 0, 2}};
 
-  BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound));
-  BOOST_CHECK_NO_THROW(c("a,b,c") +=
-                       2 * aC("a,b,c").block(lobound, upbound).conj());
-  BOOST_CHECK_NO_THROW(c("a,b,c") =
-                           2 * conj(aC("a,c,b").block(lobound, upbound)));
+  // unary/binary block expressions
+  {
+    BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound));
+    BOOST_CHECK_NO_THROW(c("a,b,c") +=
+                         2 * aC("a,b,c").block(lobound, upbound).conj());
+    BOOST_CHECK_NO_THROW(c("a,b,c") =
+                             2 * conj(aC("a,c,b").block(lobound, upbound)));
+  }
+
+  // contraction expressions
+  {
+    std::decay_t<decltype(c)> t2, t4;
+    // contraction over empty dim
+    BOOST_CHECK_NO_THROW(t4("a,c,e,d") = aC("a,b,c") * aC("d,b,e"));
+    // contraction over empty and nonempty dims
+    BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c"));
+    // contraction over nonempty dims
+    BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c"));
+  }
+
+  // reduction expressions
+  {
+    // contraction over empty dim
+    BOOST_CHECK_NO_THROW(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get());
+    BOOST_CHECK_EQUAL(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get(), 0);
+  }
 }
 
 BOOST_AUTO_TEST_SUITE_END()

From fa830a183178a68013e42abc6ecda3f329eff4e8 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 16 Feb 2024 09:24:02 -0500
Subject: [PATCH 12/20] expression suite: fixups for contractions w/
 zero-volume contraction range and w/ nonzero-volume contraction range
 producing zero-volume result

---
 src/TiledArray/dist_eval/contraction_eval.h | 184 ++++++++++++--------
 src/TiledArray/expressions/cont_engine.h    |  30 ++--
 tests/expressions_fixture.h                 |  12 +-
 tests/expressions_impl.h                    |   3 +
 4 files changed, 146 insertions(+), 83 deletions(-)

diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h
index 4755538689..a747c0748b 100644
--- a/src/TiledArray/dist_eval/contraction_eval.h
+++ b/src/TiledArray/dist_eval/contraction_eval.h
@@ -889,45 +889,63 @@ class Summa
 
   /// Initialize reduce tasks and construct broadcast groups
   ordinal_type initialize(const DenseShape&) {
-    // Construct static broadcast groups for dense arguments
-    const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
-    if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did);
-    const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
-    if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did);
+    // if contraction is over zero-volume range just initialize tiles to zero
+    if (k_ == 0) {
+      ordinal_type tile_count = 0;
+      const auto& tiles_range = this->trange().tiles_range();
+      for (auto&& tile_idx : tiles_range) {
+        auto tile_ord = tiles_range.ordinal(tile_idx);
+        if (this->is_local(tile_ord)) {
+          this->world().taskq.add([this, tile_ord, tile_idx]() {
+            this->set_tile(tile_ord,
+                           value_type(this->trange().tile(tile_idx),
+                                      typename value_type::value_type{}));
+          });
+          ++tile_count;
+        }
+      }
+      return tile_count;
+    } else {
+      // Construct static broadcast groups for dense arguments
+      const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul);
+      col_group_ = proc_grid_.make_col_group(col_did);
+      const madness::DistributedID row_did(DistEvalImpl_::id(), k_);
+      row_group_ = proc_grid_.make_row_group(row_did);
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-    std::stringstream ss;
-    ss << "init: rank=" << TensorImpl_::world().rank() << "\n    col_group_=("
-       << col_did.first << ", " << col_did.second << ") { ";
-    for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
-      ss << col_group_.world_rank(gproc) << " ";
-    ss << "}\n    row_group_=(" << row_did.first << ", " << row_did.second
-       << ") { ";
-    for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
-      ss << row_group_.world_rank(gproc) << " ";
-    ss << "}\n";
-    printf(ss.str().c_str());
+      std::stringstream ss;
+      ss << "init: rank=" << TensorImpl_::world().rank() << "\n    col_group_=("
+         << col_did.first << ", " << col_did.second << ") { ";
+      for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc)
+        ss << col_group_.world_rank(gproc) << " ";
+      ss << "}\n    row_group_=(" << row_did.first << ", " << row_did.second
+         << ") { ";
+      for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc)
+        ss << row_group_.world_rank(gproc) << " ";
+      ss << "}\n";
+      printf(ss.str().c_str());
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
-    // Allocate memory for the reduce pair tasks.
-    std::allocator<ReducePairTask<op_type>> alloc;
-    reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
+      // Allocate memory for the reduce pair tasks.
+      std::allocator<ReducePairTask<op_type>> alloc;
+      reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
 
-    // Iterate over all local tiles
-    const ordinal_type n = proc_grid_.local_size();
-    for (ordinal_type t = 0ul; t < n; ++t) {
-      // Initialize the reduction task
-      ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
-          reduce_tasks_ + t;
-      new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_
+      // Iterate over all local tiles
+      const ordinal_type n = proc_grid_.local_size();
+      for (ordinal_type t = 0ul; t < n; ++t) {
+        // Initialize the reduction task
+        ReducePairTask<op_type>* MADNESS_RESTRICT const reduce_task =
+            reduce_tasks_ + t;
+        new (reduce_task) ReducePairTask<op_type>(TensorImpl_::world(), op_
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-                                                ,
-                                                nullptr, t
+                                                  ,
+                                                  nullptr, t
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
-      );
-    }
+        );
+      }
 
-    return proc_grid_.local_size();
+      return proc_grid_.local_size();
+    }
   }
 
   /// Initialize reduce tasks
@@ -938,6 +956,9 @@ class Summa
     ss << "    initialize rank=" << TensorImpl_::world().rank() << " tiles={ ";
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE
 
+    // fast return if there is no work to do
+    if (k_ == 0) return 0;
+
     // Allocate memory for the reduce pair tasks.
     std::allocator<ReducePairTask<op_type>> alloc;
     reduce_tasks_ = alloc.allocate(proc_grid_.local_size());
@@ -1705,60 +1726,79 @@ class Summa
           std::max(ProcGrid::size_type(2),
                    std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols()));
 
-      // corner case: empty result
-      if (k_ == 0) return 0;
-
-      // Construct the first SUMMA iteration task
-      if (TensorImpl_::shape().is_dense()) {
-        // We cannot have more iterations than there are blocks in the k
-        // dimension
-        if (depth > k_) depth = k_;
-
-        // Modify the number of concurrent iterations based on the available
-        // memory.
-        depth = mem_bound_depth(depth, 0.0f, 0.0f);
-
-        // Enforce user defined depth bound
-        if (max_depth_) depth = std::min(depth, max_depth_);
-
-        TensorImpl_::world().taskq.add(
-            new DenseStepTask(shared_from_this(), depth));
-      } else {
-        // Increase the depth based on the amount of sparsity in an iteration.
+      // watch out for the corner case: contraction over zero-volume range
+      // producing nonzero-volume result ... in that case there is nothing to do
+      // the appropriate initialization was performed in the initialize() method
+      if (k_ != 0) {
+        // Construct the first SUMMA iteration task
+        if (TensorImpl_::shape().is_dense()) {
+          // We cannot have more iterations than there are blocks in the k
+          // dimension
+          if (depth > k_) depth = k_;
+
+          // Modify the number of concurrent iterations based on the available
+          // memory.
+          depth = mem_bound_depth(depth, 0.0f, 0.0f);
+
+          // Enforce user defined depth bound
+          if (max_depth_) depth = std::min(depth, max_depth_);
+
+          TensorImpl_::world().taskq.add(
+              new DenseStepTask(shared_from_this(), depth));
+        } else {
+          // Increase the depth based on the amount of sparsity in an iteration.
 
-        // Get the sparsity fractions for the left- and right-hand arguments.
-        const float left_sparsity = left_.shape().sparsity();
-        const float right_sparsity = right_.shape().sparsity();
+          // Get the sparsity fractions for the left- and right-hand arguments.
+          const float left_sparsity = left_.shape().sparsity();
+          const float right_sparsity = right_.shape().sparsity();
 
-        // Compute the fraction of non-zero result tiles in a single SUMMA
-        // iteration.
-        const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
-                                    (1.0f - std::min(right_sparsity, 0.9f));
+          // Compute the fraction of non-zero result tiles in a single SUMMA
+          // iteration.
+          const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) *
+                                      (1.0f - std::min(right_sparsity, 0.9f));
 
-        // Compute the new depth based on sparsity of the arguments
-        depth =
-            float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f;
+          // Compute the new depth based on sparsity of the arguments
+          depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) +
+                  0.5f;
 
-        // We cannot have more iterations than there are blocks in the k
-        // dimension
-        if (depth > k_) depth = k_;
+          // We cannot have more iterations than there are blocks in the k
+          // dimension
+          if (depth > k_) depth = k_;
 
-        // Modify the number of concurrent iterations based on the available
-        // memory and sparsity of the argument tensors.
-        depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
+          // Modify the number of concurrent iterations based on the available
+          // memory and sparsity of the argument tensors.
+          depth = mem_bound_depth(depth, left_sparsity, right_sparsity);
 
-        // Enforce user defined depth bound
-        if (max_depth_) depth = std::min(depth, max_depth_);
+          // Enforce user defined depth bound
+          if (max_depth_) depth = std::min(depth, max_depth_);
 
-        TensorImpl_::world().taskq.add(
-            new SparseStepTask(shared_from_this(), depth));
-      }
+          TensorImpl_::world().taskq.add(
+              new SparseStepTask(shared_from_this(), depth));
+        }
+      }  // k_ != 0
     }
 
 #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
     printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank());
 #endif  // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL
 
+    // corner case: if left or right are zero-volume no tasks were scheduled, so
+    // need to discard all of their tiles manually
+    if (left_.range().volume() == 0) {
+      for (auto&& tile_idx : right_.range()) {
+        auto tile_ord = right_.range().ordinal(tile_idx);
+        if (right_.is_local(tile_ord) && !right_.is_zero(tile_ord))
+          right_.discard(tile_ord);
+      }
+    }
+    if (right_.range().volume() == 0) {
+      for (auto&& tile_idx : left_.range()) {
+        auto tile_ord = left_.range().ordinal(tile_idx);
+        if (left_.is_local(tile_ord) && !left_.is_zero(tile_ord))
+          left_.discard(tile_ord);
+      }
+    }
+
     // Wait for child tensors to be evaluated, and process tasks while waiting.
     left_.wait();
     right_.wait();
diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h
index 8795e699c6..94562b5154 100644
--- a/src/TiledArray/expressions/cont_engine.h
+++ b/src/TiledArray/expressions/cont_engine.h
@@ -343,16 +343,26 @@ class ContEngine : public BinaryEngine<Derived> {
       n *= right_element_size[i];
     }
 
-    // Construct the process grid.
-    proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n);
-
-    // Initialize children
-    left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_));
-    right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_));
-
-    // Initialize the process map if not already defined
-    if (!pmap) pmap = proc_grid_.make_pmap();
-    ExprEngine_::init_distribution(world, pmap);
+    // corner case: zero-volume result ... easier to skip proc_grid_
+    // construction alltogether
+    if (M == 0 || N == 0) {
+      left_.init_distribution(world, {});
+      right_.init_distribution(world, {});
+      ExprEngine_::init_distribution(
+          world, (pmap ? pmap : policy::default_pmap(*world, M * N)));
+    } else {  // M!=0 && N!=0
+
+      // Construct the process grid.
+      proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n);
+
+      // Initialize children
+      left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_));
+      right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_));
+
+      // Initialize the process map if not already defined
+      if (!pmap) pmap = proc_grid_.make_pmap();
+      ExprEngine_::init_distribution(world, pmap);
+    }
   }
 
   /// Tiled range factory function
diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h
index a4f7e4cd0b..8e527465d1 100644
--- a/tests/expressions_fixture.h
+++ b/tests/expressions_fixture.h
@@ -63,10 +63,12 @@ struct ExpressionsFixture : public TiledRangeFixture {
         s_tr1_2(make_random_sparseshape(trange1)),
         s_tr2(make_random_sparseshape(trange2)),
         s_trC(make_random_sparseshape(trangeC)),
+        s_trC_f(make_random_sparseshape(trangeC_f)),
         a(*GlobalFixture::world, tr, s_tr_1),
         b(*GlobalFixture::world, tr, s_tr_2),
         c(*GlobalFixture::world, tr, s_tr_2),
         aC(*GlobalFixture::world, trangeC, s_trC),
+        aC_f(*GlobalFixture::world, trangeC_f, s_trC_f),
         u(*GlobalFixture::world, trange1, s_tr1_1),
         v(*GlobalFixture::world, trange1, s_tr1_2),
         w(*GlobalFixture::world, trange2, s_tr2) {
@@ -92,12 +94,14 @@ struct ExpressionsFixture : public TiledRangeFixture {
         u(*GlobalFixture::world, trange1),
         v(*GlobalFixture::world, trange1),
         w(*GlobalFixture::world, trange2),
-        aC(*GlobalFixture::world, trangeC) {
+        aC(*GlobalFixture::world, trangeC),
+        aC_f(*GlobalFixture::world, trangeC_f) {
     random_fill(a);
     random_fill(b);
     random_fill(u);
     random_fill(v);
     random_fill(aC);
+    random_fill(aC_f);
     GlobalFixture::world->gop.fence();
   }
 
@@ -221,12 +225,17 @@ struct ExpressionsFixture : public TiledRangeFixture {
   // contains empty trange1
   const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{},
                            TiledRange1{0, 2, 7, 11}};
+  // like trC, but with all dimension nonempty
+  const TiledRange trangeC_f{trangeC.dim(0), TiledRange1{0, 4, 7},
+                             trangeC.dim(2)};
+
   SparseShape<float> s_tr_1;
   SparseShape<float> s_tr_2;
   SparseShape<float> s_tr1_1;
   SparseShape<float> s_tr1_2;
   SparseShape<float> s_tr2;
   SparseShape<float> s_trC;
+  SparseShape<float> s_trC_f;
   TArray a;
   TArray b;
   TArray c;
@@ -234,6 +243,7 @@ struct ExpressionsFixture : public TiledRangeFixture {
   TArray v;
   TArray w;
   TArray aC;
+  TArray aC_f;
 };  // ExpressionsFixture
 
 #endif  // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H
diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h
index ea4beab3d6..268b118568 100644
--- a/tests/expressions_impl.h
+++ b/tests/expressions_impl.h
@@ -2946,6 +2946,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) {
 BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) {
   auto& c = F::c;
   auto& aC = F::aC;
+  auto& aC_f = F::aC_f;
 
   // unary/binary expressions
   {
@@ -2981,6 +2982,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) {
     BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c"));
     // contraction over nonempty dims
     BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c"));
+    // contraction over nonempty dims, involving expressions with nonzero-volume
+    BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * (2. * aC_f("d,e,c")));
   }
 
   // reduction expressions

From f0112af07423ce85dfba1cf1f8325a84111b1894 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 16 Feb 2024 10:00:03 -0500
Subject: [PATCH 13/20] [unit] re-add block_range_suite

---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3bcf8de967..76bb14e4b1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -34,6 +34,7 @@ set(executable ta_test)
 set(ta_test_src_files  ta_test.cpp
     range1.cpp
     range.cpp
+    block_range.cpp
     type_traits.cpp
     tensor.cpp
     tensor_of_tensor.cpp

From 087202a0e9b00b10638f78a9926431e9be981f11 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 16 Feb 2024 11:46:31 -0500
Subject: [PATCH 14/20] one more zero-volume corner case, in
 Expr::eval_to(BlkTsrExpr..)

---
 src/TiledArray/expressions/expr.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h
index f77d13dbad..c3fdd6423b 100644
--- a/src/TiledArray/expressions/expr.h
+++ b/src/TiledArray/expressions/expr.h
@@ -504,8 +504,11 @@ class Expr {
     // Move the data from dist_eval into the sub-block of result array.
     // This step may involve communication when the tiles are moved from the
     // sub-block distribution to the array distribution.
-    {
+    // N.B. handle the corner case of zero-volume host array, then no data needs
+    // to be moved
+    if (tsr.array().trange().tiles_range().volume() != 0) {
       // N.B. must deep copy
+      TA_ASSERT(tsr.array().trange().tiles_range().includes(tsr.lower_bound()));
       const container::svector<long> shift =
           tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound();
 

From 5809ff25258a837783983121a8df8ca2d89f4b32 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Mon, 1 Apr 2024 23:29:45 -0400
Subject: [PATCH 15/20] typo

---
 examples/dgemm/ta_blas.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dgemm/ta_blas.cpp b/examples/dgemm/ta_blas.cpp
index 0a4feff383..aeefaae908 100644
--- a/examples/dgemm/ta_blas.cpp
+++ b/examples/dgemm/ta_blas.cpp
@@ -69,7 +69,7 @@ int main(int argc, char** argv) {
   // Start clock
   const double wall_time_start = madness::wall_time();
 
-  // Do matrix multiplcation
+  // Do matrix multiplication
   // Note: If TiledArray has not been configured with blas, this will be an
   // eigen call.
   for (int i = 0; i < repeat; ++i) {

From 0dbd0eee07a44f7b577fdf11038addcc1d466883 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 2 Apr 2024 00:00:50 -0400
Subject: [PATCH 16/20] introduced patcher for Intel MKL's unfair dispatch,
 enabled by configuring with IntelMKL_FAIR_DISPATCH=ON

see https://www.agner.org/optimize/intel_dispatch_patch.zip
---
 CMakeLists.txt                                |  3 +
 INSTALL.md                                    |  1 +
 src/CMakeLists.txt                            | 10 +++
 src/TiledArray/config.h.in                    |  2 +
 .../agnerfog/intel_cpu_feature_patch.c        | 48 +++++++++++
 .../external/agnerfog/intel_mkl_cpuid_patch.c | 61 ++++++++++++++
 .../agnerfog/intel_mkl_feature_patch.c        | 49 +++++++++++
 src/TiledArray/external/agnerfog/readme.txt   | 84 +++++++++++++++++++
 src/TiledArray/tiledarray.cpp                 |  7 ++
 9 files changed, 265 insertions(+)
 create mode 100644 src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c
 create mode 100644 src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
 create mode 100644 src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c
 create mode 100644 src/TiledArray/external/agnerfog/readme.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a97e6561f8..7bc524337b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,9 @@ add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library")
 option(TA_TTG "Enable search/build of TTG library" OFF)
 add_feature_info(TA_TTG TA_TTG "TTG library")
 
+option(IntelMKL_FAIR_DISPATCH "Enable fair dispatch in Intel MKL" OFF)
+add_feature_info(IntelMKL_FAIR_DISPATCH IntelMKL_FAIR_DISPATCH "Use of fair dispatch in Intel MKL")
+
 # Enable shared library support options
 redefaultable_option(TA_ASSUMES_ASLR_DISABLED "TiledArray assumes the Address Space Layout Randomization (ASLR) to be disabled" OFF)
 add_feature_info(ASSUMES_ASLR_DISABLED TA_ASSUMES_ASLR_DISABLED
diff --git a/INSTALL.md b/INSTALL.md
index 6e7c6fc746..3f669073f0 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -423,6 +423,7 @@ support may be added.
 * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout.
 * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray.
 * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s.
+* `IntelMKL_FAIR_DISPATCH` -- If want to use Intel MKL library on non-Intel (e.g., AMD) CPUs, set to `ON` to use fair kernel dispatch. [Default=OFF].
 
 # Build TiledArray
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9bb82bf537..0167aab636 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -313,6 +313,16 @@ if( TARGET ttg-parsec )
   list(APPEND _TILEDARRAY_DEPENDENCIES ttg-parsec)
 endif()
 
+if (IntelMKL_FAIR_DISPATCH AND BLAS_IS_MKL)
+    message(WARNING "created tiledarray_mkl_dispatch")
+    add_library(tiledarray_mkl_dispatch OBJECT
+      TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
+      TiledArray/external/agnerfog/intel_mkl_feature_patch.c
+    )
+    # N.B. --allow-multiple-definition is a GNU linker extension
+    list(APPEND _TILEDARRAY_DEPENDENCIES $<TARGET_OBJECTS:tiledarray_mkl_dispatch> -Wl,--allow-multiple-definition)
+endif()
+
 # cache deps as TILEDARRAY_PRIVATE_LINK_LIBRARIES
 set(TILEDARRAY_PRIVATE_LINK_LIBRARIES ${_TILEDARRAY_DEPENDENCIES} CACHE STRING "List of libraries on which TiledArray depends on")
 
diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in
index 79f9f0932a..483847067f 100644
--- a/src/TiledArray/config.h.in
+++ b/src/TiledArray/config.h.in
@@ -113,6 +113,8 @@
 #endif  // !defined(TILEDARRAY_HAS_BTAS)
 #if defined(TILEDARRAY_HAS_BTAS) && defined(BTAS_HAS_INTEL_MKL)
 #  define TILEDARRAY_HAS_INTEL_MKL
+/* use fair dispatch in Intel MKL? */
+#cmakedefine IntelMKL_FAIR_DISPATCH
 #endif
 
 /* Add macro TILEDARRAY_FORCE_INLINE which does as the name implies. */
diff --git a/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c
new file mode 100644
index 0000000000..f3706ef1fa
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c
@@ -0,0 +1,48 @@
+/***********************  intel_cpu_feature_patch.c  **************************
+ * Author:           Agner Fog
+ * Date created:     2014-07-30
+ * Last modified:    2019-12-29
+ * Source URL:       https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language:         C or C++
+ *
+ * Description:
+ * Patch for Intel compiler version 13.0 and later, including the general
+ * libraries, LIBM and SVML, but not MKL and VML.
+ *
+ * Example of how to patch Intel's CPU feature dispatcher in order to improve
+ * compatibility of generated code with non-Intel processors.
+ * In Windows: Use the static link libraries (*.lib), not the dynamic link
+ * librarise (*.DLL).
+ * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so).
+ *
+ * Include this code in your C or C++ program and call intel_cpu_patch();
+ * before any call to the library functions.
+ *
+ * Copyright (c) 2014-2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// link to Intel libraries
+extern int64_t __intel_cpu_feature_indicator;    // CPU feature bits
+extern int64_t __intel_cpu_feature_indicator_x;  // CPU feature bits
+void __intel_cpu_features_init();  // unfair dispatcher: checks CPU features for
+                                   // Intel CPU's only
+void __intel_cpu_features_init_x();  // fair dispatcher: checks CPU features
+                                     // without discriminating by CPU brand
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
+
+void intel_cpu_patch() {
+  // force a re-evaluation of the CPU features without discriminating by CPU
+  // brand
+  __intel_cpu_feature_indicator = 0;
+  __intel_cpu_feature_indicator_x = 0;
+  __intel_cpu_features_init_x();
+  __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x;
+}
diff --git a/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
new file mode 100644
index 0000000000..b88a1807f7
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
@@ -0,0 +1,61 @@
+/***********************  intel_mkl_cpuid_patch.c  **************************
+ * Author:           Agner Fog
+ * Date created:     2019-12-29
+ * Source URL:       https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language:         C or C++
+ *
+ * Description:
+ * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except
+ * the Vector Math Library (VML).
+ *
+ * Example of how to override Intel's CPU feature dispatcher in order to improve
+ * compatibility of Intel function libraries with non-Intel processors.
+ *
+ * Include this code in your C or C++ program and make sure it is linked before
+ * any Intel libraries. You may need to include intel_mkl_feature_patch.c as
+ *well.
+ *
+ * Copyright (c) 2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// detect if Intel CPU
+int mkl_serv_intel_cpu() { return 1; }
+
+// detect if Intel CPU
+int mkl_serv_intel_cpu_true() { return 1; }
+
+int mkl_serv_cpuhaspnr_true() { return 1; }
+
+int mkl_serv_cpuhaspnr() { return 1; }
+
+int mkl_serv_cpuhasnhm() { return 1; }
+
+int mkl_serv_cpuisbulldozer() { return 0; }
+
+int mkl_serv_cpuiszen() { return 0; }
+
+int mkl_serv_cpuisatomsse4_2() { return 0; }
+
+int mkl_serv_cpuisatomssse3() { return 0; }
+
+int mkl_serv_cpuisitbarcelona() { return 0; }
+
+int mkl_serv_cpuisskl() { return 0; }
+
+int mkl_serv_cpuisknm() { return 0; }
+
+int mkl_serv_cpuisclx() { return 0; }
+
+int mkl_serv_get_microarchitecture() {
+  // I don't know what this number means
+  return 33;
+}
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
diff --git a/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c
new file mode 100644
index 0000000000..4844f2621d
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c
@@ -0,0 +1,49 @@
+/***********************  intel_mkl_feature_patch.c  **************************
+ * Author:           Agner Fog
+ * Date created:     2014-07-30
+ * Last modified:    2019-12-29
+ * Source URL:       https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language:         C or C++
+ *
+ * Description:
+ * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except
+ * the Vector Math Library (VML).
+ *
+ * Example of how to patch Intel's CPU feature dispatcher in order to improve
+ * compatibility of Intel function libraries with non-Intel processors.
+ * In Windows: Use the static link libraries (*.lib), not the dynamic link
+ * librarise (*.DLL).
+ * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so).
+ *
+ * Include this code in your C or C++ program and call intel_mkl_patch();
+ * before any call to the MKL functions. You may need to include
+ * intel_mkl_cpuid_patch.c as well.
+ *
+ * Copyright (c) 2014-2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// link to MKL libraries
+extern int64_t __intel_mkl_feature_indicator;    // CPU feature bits
+extern int64_t __intel_mkl_feature_indicator_x;  // CPU feature bits
+void __intel_mkl_features_init();  // unfair dispatcher: checks CPU features for
+                                   // Intel CPU's only
+void __intel_mkl_features_init_x();  // fair dispatcher: checks CPU features
+                                     // without discriminating by CPU brand
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
+
+void intel_mkl_use_fair_dispatch() {
+  // force a re-evaluation of the CPU features without discriminating by CPU
+  // brand
+  __intel_mkl_feature_indicator = 0;
+  __intel_mkl_feature_indicator_x = 0;
+  __intel_mkl_features_init_x();
+  __intel_mkl_feature_indicator = __intel_mkl_feature_indicator_x;
+}
diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt
new file mode 100644
index 0000000000..0f891c9ed3
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/readme.txt
@@ -0,0 +1,84 @@
+               intel_dispatch_patch.zip
+               ========================
+
+By Agner Fog, Technical University of Denmark, 2019.
+
+Intel's compilers are generating code that will run slower than necessary when
+the code is executed on a CPU that is not produced by Intel. This has been
+observed with Intel C, C++, and Fortran compilers.
+
+The same happens when certain function libraries produced by Intel are used,
+even if the code is compiled with another compiler, such as Microsoft, Gnu
+or Clang compilers.
+
+This problem is affecting several commonly used software programs such as 
+Matlab, because they are using Intel software libraries.
+
+The library code and the code generated by an Intel compiler may contain
+multiple versions, each optimized for a particular instruction set extension.
+A so-called CPU dispatcher is chosing the optimal version of the code at
+runtime, based on which CPU it is running on.
+
+CPU dispatchers can be fair or unfair. A fair CPU dispatcher is chosing the
+optimal code based only on which instruction set extensions are supported
+by the CPU. An unfair dispatcher first checks the CPU brand. If the brand
+is not Intel, then the unfair dispatcher will chose the "generic" version 
+of the code, i.e. the slowest version that is compatible with old CPUs 
+without the relevant instruction set extensions.
+
+The CPU dispatchers in many Intel function libraries have two versions, a 
+fair and an unfair one. It is not clear when the fair dispatcher is used
+and when the unfair dispatcher is used. My observations about fair and
+unfair CPU dispatching are as follows:
+
+* Code compiled with an Intel compiler will usually have unfair CPU dispatching.
+
+* The SVML (Short Vector Math Library) and IPP (Intel Performance Primitives)
+  function libraries from Intel are using the fair CPU dispatcher when used 
+  with a non-Intel compiler.
+
+* The MKL (Math Kernel Library) library contains both fair and unfair
+  dispatchers. It is not clear which dispatcher is used on each function.
+
+The code examples contained herein may be used for circumventing unfair CPU
+dispatching in order to improve compatibility with non-Intel CPUs.
+
+The following files are contained:
+
+intel_cpu_feature_patch.c
+-------------------------
+This code makes sure the fair dispatcher is called instead of the unfair
+one for code generated with an Intel compiler and for general Intel
+function libraries.
+
+intel_mkl_feature_patch.c
+-------------------------
+This does the same for the Intel MKL library.
+
+intel_mkl_cpuid_patch.c
+-----------------------
+This code example is overriding CPU detection functions in Intel's MKL 
+function library. The mkl_serv_intel_cpu() function in MKL is returning
+1 when running on an Intel CPU and 0 when running on any other brand of
+CPU. You may include this code to replace this function in MKL with a
+function that returns 1 regardless of CPU brand.
+
+It may be necessary to use both intel_mkl_feature_patch.c and 
+intel_mkl_cpuid_patch.c when using the MKL library in software that
+may run on any brand of CPU.
+
+An alternative method is to set the environment variable
+   MKL_DEBUG_CPU_TYPE=5
+when running on an AMD processor. This may be useful when you do not have
+access to the source code, for example when running Matlab software.
+
+The patches provided here are based on undocumented features in Intel
+function libraries. Use them at your own risk, and make sure to test your
+code properly to make sure it works as intended.
+
+The most reliable solution is, of course, to avoid Intel compilers and 
+Intel function libraries in code that may run on other CPU brands such
+as AMD and VIA. You may find other function libraries on the web, or 
+you may make your own functions. My vector class library (VCL) is useful
+for making mathematical functions that process multiple data in parallel,
+using the vector processing features of modern CPUs.
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp
index 38bf61e86e..2a4b3d1199 100644
--- a/src/TiledArray/tiledarray.cpp
+++ b/src/TiledArray/tiledarray.cpp
@@ -16,6 +16,10 @@
 #include <ttg.h>
 #endif
 
+#ifdef IntelMKL_FAIR_DISPATCH
+extern "C" void intel_mkl_use_fair_dispatch();
+#endif
+
 #include <cerrno>
 #include <csignal>
 #include <cstdlib>
@@ -100,6 +104,9 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv,
     TiledArray::set_default_world(default_world);
 #ifdef TILEDARRAY_HAS_DEVICE
     TiledArray::device_initialize();
+#endif
+#ifdef IntelMKL_FAIR_DISPATCH
+    intel_mkl_use_fair_dispatch();
 #endif
     TiledArray::max_threads = TiledArray::get_num_threads();
     TiledArray::set_num_threads(1);

From 1af2613553a2c8853253921d28c3d7e1b7994a9b Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 2 Apr 2024 11:53:40 -0400
Subject: [PATCH 17/20] introduced duration deque and statistics computation

---
 src/TiledArray/util/time.h | 83 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/src/TiledArray/util/time.h b/src/TiledArray/util/time.h
index aa0639bc0a..8ae649a6af 100644
--- a/src/TiledArray/util/time.h
+++ b/src/TiledArray/util/time.h
@@ -26,7 +26,10 @@
 #ifndef TILEDARRAY_UTIL_TIME_H__INCLUDED
 #define TILEDARRAY_UTIL_TIME_H__INCLUDED
 
+#include <algorithm>
 #include <chrono>
+#include <cmath>
+#include <deque>
 
 namespace TiledArray {
 
@@ -46,6 +49,86 @@ inline int64_t duration_in_ns(time_point const &t0, time_point const &t1) {
   return std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
 }
 
+namespace detail {
+inline std::deque<double> &call_durations_accessor() {
+  static std::deque<double> call_durations;
+  return call_durations;
+}
+}  // namespace detail
+
+/// Access recorded durations
+inline const std::deque<double> &durations() {
+  return detail::call_durations_accessor();
+}
+
+/// Clear recorded durations
+inline void clear_durations() { detail::call_durations_accessor().clear(); }
+
+/// Record duration since the given time point
+/// \param tp_start The start time point
+inline void record_duration_since(const time_point &tp_start) {
+  detail::call_durations_accessor().push_back(duration_in_s(tp_start, now()));
+}
+
+/// Record duration of a single function call
+template <typename F, typename... Args>
+void record_duration(F &&f, Args &&...args) {
+  auto tp_start = now();
+  std::forward<F>(f)(std::forward<Args>(args)...);
+  record_duration_since(tp_start);
+}
+
+/// Statistics of recorded durations
+struct duration_stats_t {
+  double min = 0.0;
+  double max = 0.0;
+  double mean = 0.0;
+  double stddev = 0.0;
+  double median = 0.0;
+  double mean_reciprocal = 0.0;
+};
+
+/// Compute statistics of recorded durations
+/// \return Statistics of recorded durations
+inline duration_stats_t duration_statistics() {
+  duration_stats_t stats;
+  auto &durations = detail::call_durations_accessor();
+  if (durations.empty()) return stats;
+
+  stats.min = durations.front();
+  stats.max = durations.front();
+  stats.mean = durations.front();
+  stats.mean_reciprocal = 1.0 / durations.front();
+  double total = stats.mean;
+  double total_reciprocal = stats.mean_reciprocal;
+  for (size_t i = 1; i < durations.size(); ++i) {
+    total += durations[i];
+    total_reciprocal += 1. / durations[i];
+    stats.min = std::min(stats.min, durations[i]);
+    stats.max = std::max(stats.max, durations[i]);
+  }
+  stats.mean = total / durations.size();
+  stats.mean_reciprocal = total_reciprocal / durations.size();
+
+  double sum_sq = 0.0;
+  for (size_t i = 0; i < durations.size(); ++i) {
+    sum_sq += (durations[i] - stats.mean) * (durations[i] - stats.mean);
+  }
+  stats.stddev =
+      durations.size() > 1 ? std::sqrt(sum_sq / (durations.size() - 1)) : 0.0;
+
+  std::sort(durations.begin(), durations.end());
+  stats.median = durations[durations.size() / 2];
+
+  return stats;
+}
+
 }  // namespace TiledArray
 
+#ifndef TA_RECORD_DURATION
+/// Record duration of a statement
+#define TA_RECORD_DURATION(statement) \
+  TiledArray::record_duration([&] { statement; });
+#endif  // !defined(TA_RECORD_DURATION)
+
 #endif  // TILEDARRAY_UTIL_TIME_H__INCLUDED

From c49adefd5ef40a8a401cc25f997019d9b69c0445 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 2 Apr 2024 11:55:24 -0400
Subject: [PATCH 18/20] examples/dgemm -> examples/gemm + misc cleanup

- cleanup examples to use centralized timers/statistics facilities
- remove dense_new_tile
---
 examples/CMakeLists.txt                       |   2 +-
 examples/dgemm/ta_dense_new_tile.cpp          | 168 ------------------
 examples/{dgemm => gemm}/CMakeLists.txt       |   2 +-
 examples/{dgemm => gemm}/README               |   4 +-
 .../block_size_data_process.py                |   0
 examples/{dgemm => gemm}/block_size_scan.sh   |   0
 examples/{dgemm => gemm}/ta_band.cpp          |  32 ++--
 examples/{dgemm => gemm}/ta_blas.cpp          |  29 ++-
 examples/{dgemm => gemm}/ta_cc_abcd.cpp       |  41 +++--
 examples/{dgemm => gemm}/ta_dense.cpp         |  29 ++-
 examples/{dgemm => gemm}/ta_dense_asymm.cpp   |  28 ++-
 .../{dgemm => gemm}/ta_dense_nonuniform.cpp   |  27 +--
 examples/{dgemm => gemm}/ta_eigen.cpp         |  23 +--
 examples/{dgemm => gemm}/ta_sparse.cpp        |   0
 examples/{dgemm => gemm}/ta_sparse_grow.cpp   |   0
 15 files changed, 99 insertions(+), 286 deletions(-)
 delete mode 100644 examples/dgemm/ta_dense_new_tile.cpp
 rename examples/{dgemm => gemm}/CMakeLists.txt (94%)
 rename examples/{dgemm => gemm}/README (92%)
 rename examples/{dgemm => gemm}/block_size_data_process.py (100%)
 rename examples/{dgemm => gemm}/block_size_scan.sh (100%)
 rename examples/{dgemm => gemm}/ta_band.cpp (86%)
 rename examples/{dgemm => gemm}/ta_blas.cpp (80%)
 rename examples/{dgemm => gemm}/ta_cc_abcd.cpp (93%)
 rename examples/{dgemm => gemm}/ta_dense.cpp (89%)
 rename examples/{dgemm => gemm}/ta_dense_asymm.cpp (92%)
 rename examples/{dgemm => gemm}/ta_dense_nonuniform.cpp (87%)
 rename examples/{dgemm => gemm}/ta_eigen.cpp (76%)
 rename examples/{dgemm => gemm}/ta_sparse.cpp (100%)
 rename examples/{dgemm => gemm}/ta_sparse_grow.cpp (100%)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 99edd4e33b..d240192893 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -30,7 +30,7 @@ endif()
 # Add Subdirectories
 add_subdirectory (cc)
 add_subdirectory (device)
-add_subdirectory (dgemm)
+add_subdirectory (gemm)
 add_subdirectory (demo)
 add_subdirectory (scalapack)
 add_subdirectory (fock)
diff --git a/examples/dgemm/ta_dense_new_tile.cpp b/examples/dgemm/ta_dense_new_tile.cpp
deleted file mode 100644
index 79dae8a579..0000000000
--- a/examples/dgemm/ta_dense_new_tile.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * This file is a part of TiledArray.
- * Copyright (C) 2013  Virginia Tech
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-#include <TiledArray/version.h>
-#include <tiledarray.h>
-#include <iostream>
-
-using Tile_t = TiledArray::Tile<TiledArray::Tensor<double>>;
-using Array_t = TiledArray::DistArray<Tile_t>;
-
-void set_tiles(double val, Array_t& a) {
-  auto const& trange = a.trange();
-
-  auto pmap = a.pmap();
-  const auto end = pmap->end();
-  for (auto it = pmap->begin(); it != end; ++it) {
-    auto range = trange.make_tile_range(*it);
-    a.set(*it, Tile_t(TiledArray::Tensor<double>(range, val)));
-  }
-}
-
-int main(int argc, char** argv) {
-  int rc = 0;
-
-  try {
-    // Initialize runtime
-    TiledArray::World& world = TA_SCOPED_INITIALIZE(argc, argv);
-
-    // Get command line arguments
-    if (argc < 2) {
-      std::cout << "Usage: " << argv[0]
-                << " matrix_size block_size [repetitions]\n";
-      return 0;
-    }
-    const long matrix_size = atol(argv[1]);
-    const long block_size = atol(argv[2]);
-    if (matrix_size <= 0) {
-      std::cerr << "Error: matrix size must be greater than zero.\n";
-      return 1;
-    }
-    if (block_size <= 0) {
-      std::cerr << "Error: block size must be greater than zero.\n";
-      return 1;
-    }
-    if ((matrix_size % block_size) != 0ul) {
-      std::cerr << "Error: matrix size must be evenly divisible by block "
-                   "size.\n";
-      return 1;
-    }
-    const long repeat = (argc >= 4 ? atol(argv[3]) : 5);
-    if (repeat <= 0) {
-      std::cerr << "Error: number of repetitions must be greater than zero.\n";
-      return 1;
-    }
-
-    const std::size_t num_blocks = matrix_size / block_size;
-    const std::size_t block_count = num_blocks * num_blocks;
-
-    if (world.rank() == 0)
-      std::cout << "TiledArray: dense matrix multiply test..."
-                << "\nGit description: " << TiledArray::git_description()
-                << "\nNumber of nodes     = " << world.size()
-                << "\nMatrix size         = " << matrix_size << "x"
-                << matrix_size << "\nBlock size          = " << block_size
-                << "x" << block_size << "\nMemory per matrix   = "
-                << double(matrix_size * matrix_size * sizeof(double)) / 1.0e9
-                << " GB\nNumber of blocks    = " << block_count
-                << "\nAverage blocks/node = "
-                << double(block_count) / double(world.size()) << "\n";
-
-    const double flop =
-        2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9;
-
-    // Construct TiledRange
-    std::vector<unsigned int> blocking;
-    blocking.reserve(num_blocks + 1);
-    for (long i = 0l; i <= matrix_size; i += block_size) blocking.push_back(i);
-
-    std::vector<TiledArray::TiledRange1> blocking2(
-        2, TiledArray::TiledRange1(blocking.begin(), blocking.end()));
-
-    TiledArray::TiledRange trange(blocking2.begin(), blocking2.end());
-
-    // Construct and initialize arrays
-    Array_t a(world, trange);
-    Array_t b(world, trange);
-    Array_t c(world, trange);
-    set_tiles(1.0, a);
-    set_tiles(1.0, b);
-
-    TiledArray::TArrayD a_check(world, trange);
-    TiledArray::TArrayD b_check(world, trange);
-    TiledArray::TArrayD c_check(world, trange);
-    a_check.fill(1.0);
-    b_check.fill(1.0);
-
-    // Start clock
-    world.gop.fence();
-    if (world.rank() == 0)
-      std::cout << "Starting iterations: "
-                << "\n";
-
-    double total_time = 0.0;
-
-    // Do matrix multiplication
-    for (int i = 0; i < repeat; ++i) {
-      const double start = madness::wall_time();
-      c("m,n") = a("m,k") * b("k,n");
-      c_check("m,n") = a_check("m,k") * b_check("k,n");
-      //      world.gop.fence();
-      const double time = madness::wall_time() - start;
-      total_time += time;
-      if (world.rank() == 0)
-        std::cout << "Iteration " << i + 1 << "   time=" << time
-                  << "   GFLOPS=" << flop / time << "\n";
-      auto check_it = c_check.begin();
-      for (auto it = c.begin(); it != c.end() && check_it != c_check.end();
-           ++it, ++check_it) {
-        auto tile_diff = it->get().tensor().subt(check_it->get()).norm();
-        if (tile_diff >= 1e-15) {
-          std::cout << "Tile " << it.ordinal() << " failed test "
-                    << " with norm diff " << tile_diff << std::endl;
-          assert(false);
-        }
-      }
-    }
-
-    // Print results
-    if (world.rank() == 0)
-      std::cout << "Average wall time   = " << total_time / double(repeat)
-                << " sec\nAverage GFLOPS      = "
-                << double(repeat) * flop / total_time << "\n";
-
-  } catch (TiledArray::Exception& e) {
-    std::cerr << "!! TiledArray exception: " << e.what() << "\n";
-    rc = 1;
-  } catch (madness::MadnessException& e) {
-    std::cerr << "!! MADNESS exception: " << e.what() << "\n";
-    rc = 1;
-  } catch (SafeMPI::Exception& e) {
-    std::cerr << "!! SafeMPI exception: " << e.what() << "\n";
-    rc = 1;
-  } catch (std::exception& e) {
-    std::cerr << "!! std exception: " << e.what() << "\n";
-    rc = 1;
-  } catch (...) {
-    std::cerr << "!! exception: unknown exception\n";
-    rc = 1;
-  }
-
-  return rc;
-}
diff --git a/examples/dgemm/CMakeLists.txt b/examples/gemm/CMakeLists.txt
similarity index 94%
rename from examples/dgemm/CMakeLists.txt
rename to examples/gemm/CMakeLists.txt
index 47df67bf36..5808cdec6e 100644
--- a/examples/dgemm/CMakeLists.txt
+++ b/examples/gemm/CMakeLists.txt
@@ -26,7 +26,7 @@
 # Create example executable
 
 foreach(_exec ta_blas ta_eigen ta_band ta_dense ta_sparse ta_dense_nonuniform
-              ta_dense_asymm ta_sparse_grow ta_dense_new_tile
+              ta_dense_asymm ta_sparse_grow
               ta_cc_abcd)
 
   # Add executable
diff --git a/examples/dgemm/README b/examples/gemm/README
similarity index 92%
rename from examples/dgemm/README
rename to examples/gemm/README
index bbb80e88c0..de156f154d 100644
--- a/examples/dgemm/README
+++ b/examples/gemm/README
@@ -12,9 +12,9 @@ Applications usage:
 
   ta_band matrix_size block_size band_width [repetitions]
 
-  blas matrix_size [repetitions]
+  ta_blas matrix_size [repetitions]
 
-  eigen matrix_size [repetitions]
+  ta_eigen matrix_size [repetitions]
 
 Argument definitions:
 
diff --git a/examples/dgemm/block_size_data_process.py b/examples/gemm/block_size_data_process.py
similarity index 100%
rename from examples/dgemm/block_size_data_process.py
rename to examples/gemm/block_size_data_process.py
diff --git a/examples/dgemm/block_size_scan.sh b/examples/gemm/block_size_scan.sh
similarity index 100%
rename from examples/dgemm/block_size_scan.sh
rename to examples/gemm/block_size_scan.sh
diff --git a/examples/dgemm/ta_band.cpp b/examples/gemm/ta_band.cpp
similarity index 86%
rename from examples/dgemm/ta_band.cpp
rename to examples/gemm/ta_band.cpp
index d55550cebd..0743ef734b 100644
--- a/examples/dgemm/ta_band.cpp
+++ b/examples/gemm/ta_band.cpp
@@ -17,6 +17,7 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include <tiledarray.h>
 #include <iostream>
 
@@ -104,38 +105,33 @@ int main(int argc, char** argv) {
       for (; j < j_end; ++j, ++ij) shape_tensor[ij] = 1.0;
     }
 
-    TiledArray::SparseShape<float> shape(shape_tensor, trange);
+    TiledArray::SparseShape<float> shape(
+        shape_tensor, trange, /* per_element_norms_already = */ true);
 
     // Construct and initialize arrays
     TiledArray::TSpArrayD a(world, trange, shape);
     TiledArray::TSpArrayD b(world, trange, shape);
-    TiledArray::TSpArrayD c(world, trange);
+    TiledArray::TSpArrayD c;
     a.fill(1.0);
     b.fill(1.0);
 
-    // Start clock
-    world.gop.fence();
-    const double wall_time_start = madness::wall_time();
-
     // Do matrix multiplication
+    world.gop.fence();
     for (int i = 0; i < repeat; ++i) {
-      c("m,n") = a("m,k") * b("k,n");
-      world.gop.fence();
+      TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();)
       if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n";
     }
 
-    // Stop clock
-    const double wall_time_stop = madness::wall_time();
-
     // Print results
-    const long flop = 2.0 * c("m,n").sum().get();
+    const auto gflops_per_call = 2.0 * c("m,n").sum().get() / 1.e9;
     if (world.rank() == 0) {
-      std::cout << "Average wall time = "
-                << (wall_time_stop - wall_time_start) / double(repeat)
-                << "\nAverage GFLOPS = "
-                << double(repeat) * double(flop) /
-                       (wall_time_stop - wall_time_start) / 1.0e9
-                << "\n";
+      auto durations = TiledArray::duration_statistics();
+      std::cout << "Average wall time   = " << durations.mean
+                << " s\nAverage GFLOPS      = "
+                << gflops_per_call * durations.mean_reciprocal
+                << "\nMedian wall time   = " << durations.median
+                << " s\nMedian GFLOPS      = "
+                << gflops_per_call / durations.median << "\n";
     }
 
   } catch (TiledArray::Exception& e) {
diff --git a/examples/dgemm/ta_blas.cpp b/examples/gemm/ta_blas.cpp
similarity index 80%
rename from examples/dgemm/ta_blas.cpp
rename to examples/gemm/ta_blas.cpp
index aeefaae908..c97f5bbedc 100644
--- a/examples/dgemm/ta_blas.cpp
+++ b/examples/gemm/ta_blas.cpp
@@ -17,13 +17,14 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include <tiledarray.h>
 #include <iostream>
 
 int main(int argc, char** argv) {
   // Get command line arguments
   if (argc < 2) {
-    std::cout << "Usage: " << argv[0] << " matrix_size [repetitions]\n";
+    std::cout << "Usage: " << argv[0] << " matrix_size [repetitions = 5]\n";
     return 0;
   }
   const long matrix_size = atol(argv[1]);
@@ -66,31 +67,25 @@ int main(int argc, char** argv) {
   const integer m = matrix_size, n = matrix_size, k = matrix_size;
   const integer lda = matrix_size, ldb = matrix_size, ldc = matrix_size;
 
-  // Start clock
-  const double wall_time_start = madness::wall_time();
-
   // Do matrix multiplication
-  // Note: If TiledArray has not been configured with blas, this will be an
-  // eigen call.
   for (int i = 0; i < repeat; ++i) {
-    gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    TA_RECORD_DURATION(
+        gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
   }
-
-  // Stop clock
-  const double wall_time_stop = madness::wall_time();
+  auto durations = TiledArray::duration_statistics();
 
   // Cleanup memory
   free(a);
   free(b);
   free(c);
 
-  std::cout << "Average wall time = "
-            << (wall_time_stop - wall_time_start) / double(repeat)
-            << "\nAverage GFLOPS = "
-            << double(repeat) * 2.0 *
-                   double(matrix_size * matrix_size * matrix_size) /
-                   (wall_time_stop - wall_time_start) / 1.0e9
-            << "\n";
+  const auto gflops_per_call =
+      2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9;
+  std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = "
+            << gflops_per_call * durations.mean_reciprocal
+            << "\nMedian wall time = " << durations.median
+            << "\nMedian GFLOPS = " << gflops_per_call / durations.median
+            << std::endl;
 
   return 0;
 }
diff --git a/examples/dgemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp
similarity index 93%
rename from examples/dgemm/ta_cc_abcd.cpp
rename to examples/gemm/ta_cc_abcd.cpp
index c1881063d4..df05e78c4a 100644
--- a/examples/dgemm/ta_cc_abcd.cpp
+++ b/examples/gemm/ta_cc_abcd.cpp
@@ -17,6 +17,7 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include <TiledArray/version.h>
 #include <tiledarray.h>
 #include <iostream>
@@ -35,13 +36,13 @@ bool to_bool(const char* str) {
 // the last tile absorbs the remainder
 std::vector<unsigned int> make_tiling(unsigned int range_size,
                                       unsigned int ntiles) {
-  const auto average_tile_size = range_size / ntiles;
-  TA_ASSERT(average_tile_size > ntiles);
+  const int average_tile_size = range_size / ntiles;
   std::vector<unsigned int> result(ntiles + 1);
   result[0] = 0;
   for (long t = 0; t != ntiles - 1; ++t) {
-    result[t + 1] =
-        result[t] + average_tile_size + ((t % 2 == 0) ? (t + 1) : (-t));
+    result[t + 1] = result[t] + average_tile_size +
+                    std::max(static_cast<int>((t % 2 == 0) ? (t + 1) : (-t)),
+                             1 - average_tile_size);
   }
   result[ntiles] = range_size;
   return result;
@@ -174,8 +175,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
   const double flops_per_fma =
       (complex_T ? 8 : 2);  // 1 multiply takes 6/1 flops for complex/real
                             // 1 add takes 2/1 flops for complex/real
-  const double n_gflop = flops_per_fma * std::pow(n_occ, 2) *
-                         std::pow(n_uocc, 4) / std::pow(1024., 3);
+  const double gflops_per_call = flops_per_fma * std::pow(n_occ, 2) *
+                                 std::pow(n_uocc, 4) / std::pow(1024., 3);
 
   // Construct tensors
   TA::TArrayD t2(world, trange_oovv);
@@ -196,13 +197,9 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
     std::cout << "Starting iterations: "
               << "\n";
 
-  double total_time = 0.0;
-  double total_gflop_rate = 0.0;
-
   // Do matrix multiplication
   for (int i = 0; i < repeat; ++i) {
-    const double start = madness::wall_time();
-
+    auto tp_start = TiledArray::now();
     // this is how the user would express this contraction
     if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");
 
@@ -223,23 +220,25 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
                   << error("i,j,a,b").squared_norm().get() << std::endl;
       }
     }
+    TiledArray::record_duration_since(tp_start);
 
-    const double stop = madness::wall_time();
-    const double time = stop - start;
-    total_time += time;
-    const double gflop_rate = n_gflop / time;
-    total_gflop_rate += gflop_rate;
+    const double time = TiledArray::durations().back();
+    const double gflop_rate = gflops_per_call / time;
     if (world.rank() == 0)
       std::cout << "Iteration " << i + 1 << "   time=" << time
                 << "   GFLOPS=" << gflop_rate << "\n";
   }
 
   // Print results
-  if (world.rank() == 0)
-    std::cout << "Average wall time   = "
-              << total_time / static_cast<double>(repeat)
-              << " sec\nAverage GFLOPS      = "
-              << total_gflop_rate / static_cast<double>(repeat) << "\n";
+  if (world.rank() == 0) {
+    auto durations = TiledArray::duration_statistics();
+    std::cout << "Average wall time   = " << durations.mean
+              << " s\nAverage GFLOPS      = "
+              << gflops_per_call * durations.mean_reciprocal
+              << "\nMedian wall time   = " << durations.median
+              << " s\nMedian GFLOPS      = "
+              << gflops_per_call / durations.median << "\n";
+  }
 }
 
 template <typename LeftTile, typename RightTile, typename Policy, typename Op>
diff --git a/examples/dgemm/ta_dense.cpp b/examples/gemm/ta_dense.cpp
similarity index 89%
rename from examples/dgemm/ta_dense.cpp
rename to examples/gemm/ta_dense.cpp
index 82506b1d0d..c0ffebd4dc 100644
--- a/examples/dgemm/ta_dense.cpp
+++ b/examples/gemm/ta_dense.cpp
@@ -17,6 +17,7 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include <TiledArray/version.h>
 #include <madness/world/worldmem.h>
 #include <tiledarray.h>
@@ -129,7 +130,7 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange,
 
   const auto n = trange.elements_range().extent()[0];
   const auto complex_T = TiledArray::detail::is_complex<T>::value;
-  const double gflop =
+  const double gflops_per_call =
       (complex_T ? 8 : 2)  // 1 multiply takes 6/1 flops for complex/real
                            // 1 add takes 2/1 flops for complex/real
       * double(n * n * n) / 1.0e9;
@@ -168,28 +169,26 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange,
       std::cout << "Starting iterations: "
                 << "\n";
 
-    double total_time = 0.0;
-    double total_gflop_rate = 0.0;
-
     // Do matrix multiplication
     for (int i = 0; i < repeat; ++i) {
-      const double start = madness::wall_time();
-      c("m,n") = a("m,k") * b("k,n");
-      memtrace("c=a*b");
-      const double time = madness::wall_time() - start;
-      total_time += time;
-      const double gflop_rate = gflop / time;
-      total_gflop_rate += gflop_rate;
+      TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");)
+      const auto time = TiledArray::durations().back();
+      const double gflop_rate = gflops_per_call / time;
       if (world.rank() == 0)
         std::cout << "Iteration " << i + 1 << "   time=" << time
                   << "   GFLOPS=" << gflop_rate << "\n";
     }
 
     // Print results
-    if (world.rank() == 0)
-      std::cout << "Average wall time   = " << total_time / double(repeat)
-                << " sec\nAverage GFLOPS      = "
-                << total_gflop_rate / double(repeat) << "\n";
+    if (world.rank() == 0) {
+      auto durations = TiledArray::duration_statistics();
+      std::cout << "Average wall time   = " << durations.mean
+                << " s\nAverage GFLOPS      = "
+                << gflops_per_call * durations.mean_reciprocal
+                << "\nMedian wall time   = " << durations.median
+                << " s\nMedian GFLOPS      = "
+                << gflops_per_call / durations.median << "\n";
+    }
 
   }  // array lifetime scope
   memtrace("stop");
diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp
similarity index 92%
rename from examples/dgemm/ta_dense_asymm.cpp
rename to examples/gemm/ta_dense_asymm.cpp
index 40183603bb..acef959c7a 100644
--- a/examples/dgemm/ta_dense_asymm.cpp
+++ b/examples/gemm/ta_dense_asymm.cpp
@@ -18,6 +18,7 @@
  */
 
 #include <TiledArray/external/btas.h>
+#include <TiledArray/util/time.h>
 #include <tiledarray.h>
 #include <iostream>
 
@@ -125,11 +126,11 @@ int main(int argc, char** argv) {
     using Array = std::decay_t<std::remove_pointer_t<decltype(tarray_ptr)>>;
     using T = TiledArray::detail::numeric_t<Array>;
     const auto complex_T = TiledArray::detail::is_complex_v<T>;
-    const std::int64_t nflops =
+    const double gflops_per_call =
         (complex_T ? 8 : 2)  // 1 multiply takes 6/1 flops for complex/real
                              // 1 add takes 2/1 flops for complex/real
         * static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
-        static_cast<std::int64_t>(Nk);
+        static_cast<std::int64_t>(Nk) / 1.e9;
 
     if (world.rank() == 0)
       std::cout << "TiledArray: dense matrix multiply test...\n"
@@ -182,18 +183,11 @@ int main(int argc, char** argv) {
         std::cout << "Starting iterations: "
                   << "\n";
 
-      double total_time = 0.0;
-      double total_gflop_rate = 0.0;
-
       // Do matrix multiplication
       for (int i = 0; i < repeat; ++i) {
-        const double start = madness::wall_time();
-        c("m,n") = a("m,k") * b("k,n");
-        memtrace("c=a*b");
-        const double time = madness::wall_time() - start;
-        total_time += time;
-        const double gflop_rate = double(nflops) / (time * 1.e9);
-        total_gflop_rate += gflop_rate;
+        TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");)
+        const double time = TiledArray::durations().back();
+        const double gflop_rate = gflops_per_call / time;
         if (world.rank() == 0)
           std::cout << "Iteration " << i + 1 << "   time=" << time
                     << "   GFLOPS=" << gflop_rate << "\n";
@@ -203,9 +197,13 @@ int main(int argc, char** argv) {
       const double wall_time_stop = madness::wall_time();
 
       if (world.rank() == 0) {
-        std::cout << "Average wall time   = " << total_time / double(repeat)
-                  << " sec\nAverage GFLOPS      = "
-                  << total_gflop_rate / double(repeat) << "\n";
+        auto durations = TiledArray::duration_statistics();
+        std::cout << "Average wall time   = " << durations.mean
+                  << " s\nAverage GFLOPS      = "
+                  << gflops_per_call * durations.mean_reciprocal
+                  << "\nMedian wall time   = " << durations.median
+                  << " s\nMedian GFLOPS      = "
+                  << gflops_per_call / durations.median << "\n";
       }
 
     }  // array lifetime scope
diff --git a/examples/dgemm/ta_dense_nonuniform.cpp b/examples/gemm/ta_dense_nonuniform.cpp
similarity index 87%
rename from examples/dgemm/ta_dense_nonuniform.cpp
rename to examples/gemm/ta_dense_nonuniform.cpp
index c01a4ece11..20e8cce712 100644
--- a/examples/dgemm/ta_dense_nonuniform.cpp
+++ b/examples/gemm/ta_dense_nonuniform.cpp
@@ -17,6 +17,7 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include <TiledArray/version.h>
 #include <tiledarray.h>
 #include <iostream>
@@ -58,7 +59,7 @@ int main(int argc, char** argv) {
     const long num_blocks = matrix_size / block_size;
     const long block_count = num_blocks * num_blocks;
 
-    const double flop =
+    const double gflops_per_call =
         2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9;
 
     // Construct TiledRange
@@ -121,25 +122,25 @@ int main(int argc, char** argv) {
       std::cout << "Starting iterations: "
                 << "\n";
 
-    double total_time = 0.0;
-
     // Do matrix multiplication
     for (int i = 0; i < repeat; ++i) {
-      const double start = madness::wall_time();
-      c("m,n") = a("m,k") * b("k,n");
-      //      world.gop.fence();
-      const double time = madness::wall_time() - start;
-      total_time += time;
+      TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();)
+      const double time = TiledArray::durations().back();
       if (world.rank() == 0)
         std::cout << "Iteration " << i + 1 << "   time=" << time
-                  << "   GFLOPS=" << flop / time << "\n";
+                  << "   GFLOPS=" << gflops_per_call / time << "\n";
     }
 
     // Print results
-    if (world.rank() == 0)
-      std::cout << "Average wall time   = " << total_time / double(repeat)
-                << " sec\nAverage GFLOPS      = "
-                << double(repeat) * flop / total_time << "\n";
+    if (world.rank() == 0) {
+      auto durations = TiledArray::duration_statistics();
+      std::cout << "Average wall time   = " << durations.mean
+                << " s\nAverage GFLOPS      = "
+                << gflops_per_call * durations.mean_reciprocal
+                << "\nMedian wall time   = " << durations.median
+                << " s\nMedian GFLOPS      = "
+                << gflops_per_call / durations.median << "\n";
+    }
 
   } catch (TiledArray::Exception& e) {
     std::cerr << "!! TiledArray exception: " << e.what() << "\n";
diff --git a/examples/dgemm/ta_eigen.cpp b/examples/gemm/ta_eigen.cpp
similarity index 76%
rename from examples/dgemm/ta_eigen.cpp
rename to examples/gemm/ta_eigen.cpp
index 0aa5474cd6..018de9a81f 100644
--- a/examples/dgemm/ta_eigen.cpp
+++ b/examples/gemm/ta_eigen.cpp
@@ -17,6 +17,7 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include <tiledarray.h>
 #include <iostream>
 
@@ -50,24 +51,16 @@ int main(int argc, char** argv) {
   b.fill(1.0);
   c.fill(0.0);
 
-  // Start clock
-  const double wall_time_start = madness::wall_time();
-
-  // Do matrix multiplcation
+  // Do matrix multiplication
   for (int i = 0; i < repeat; ++i) {
-    c.noalias() = 1.0 * a * b + 0.0 * c;
+    TA_RECORD_DURATION(c.noalias() = 1.0 * a * b + 0.0 * c);
   }
 
-  // Stop clock
-  const double wall_time_stop = madness::wall_time();
-
-  std::cout << "Average wall time = "
-            << (wall_time_stop - wall_time_start) / double(repeat)
-            << "\nAverage GFLOPS = "
-            << double(repeat) * 2.0 *
-                   double(matrix_size * matrix_size * matrix_size) /
-                   (wall_time_stop - wall_time_start) / 1.0e9
-            << "\n";
+  auto durations = TiledArray::duration_statistics();
+  std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = "
+            << (2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9) *
+                   durations.mean_reciprocal
+            << std::endl;
 
   return 0;
 }
diff --git a/examples/dgemm/ta_sparse.cpp b/examples/gemm/ta_sparse.cpp
similarity index 100%
rename from examples/dgemm/ta_sparse.cpp
rename to examples/gemm/ta_sparse.cpp
diff --git a/examples/dgemm/ta_sparse_grow.cpp b/examples/gemm/ta_sparse_grow.cpp
similarity index 100%
rename from examples/dgemm/ta_sparse_grow.cpp
rename to examples/gemm/ta_sparse_grow.cpp

From 1ce2c33dcaa37cd688205aab9679515d0b3e3f6f Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Tue, 2 Apr 2024 17:36:17 -0400
Subject: [PATCH 19/20] fetching eigen3 via ExternalProject_Add's needs to
 specify STAMP_DIR and TMP_DIR just like librett and umpire

---
 external/eigen.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/external/eigen.cmake b/external/eigen.cmake
index f2d28076dd..1489b53fe6 100644
--- a/external/eigen.cmake
+++ b/external/eigen.cmake
@@ -104,7 +104,9 @@ else()
 
   ExternalProject_Add(eigen3
     PREFIX ${CMAKE_INSTALL_PREFIX}
-   #--Download step--------------
+    STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts
+    TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts  # needed in case CMAKE_INSTALL_PREFIX is not writable
+    #--Download step--------------
     DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR}
     URL ${EIGEN3_URL}
     URL_HASH ${EIGEN3_URL_HASH}

From 5e22878f2775f711286fcb03b4ddfaf46f089100 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 3 Apr 2024 08:30:25 -0400
Subject: [PATCH 20/20] to avoid premature creation of install directory
 ExternalProject_add will use FETCHCONTENT_BASE_DIR as PREFIX

---
 external/eigen.cmake   | 2 +-
 external/librett.cmake | 2 +-
 external/umpire.cmake  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/external/eigen.cmake b/external/eigen.cmake
index 1489b53fe6..57bbead90d 100644
--- a/external/eigen.cmake
+++ b/external/eigen.cmake
@@ -103,7 +103,7 @@ else()
   message("** Will build Eigen from ${EIGEN3_URL}")
 
   ExternalProject_Add(eigen3
-    PREFIX ${CMAKE_INSTALL_PREFIX}
+    PREFIX ${FETCHCONTENT_BASE_DIR}
     STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts
     TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts  # needed in case CMAKE_INSTALL_PREFIX is not writable
     #--Download step--------------
diff --git a/external/librett.cmake b/external/librett.cmake
index c04cf56b38..afebabb486 100644
--- a/external/librett.cmake
+++ b/external/librett.cmake
@@ -109,7 +109,7 @@ else()
     message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}")
 
     ExternalProject_Add(librett
-            PREFIX ${CMAKE_INSTALL_PREFIX}
+            PREFIX ${FETCHCONTENT_BASE_DIR}
             STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts
             TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts  # needed in case CMAKE_INSTALL_PREFIX is not writable
             #--Download step--------------
diff --git a/external/umpire.cmake b/external/umpire.cmake
index c7a02d65bf..57675ca189 100644
--- a/external/umpire.cmake
+++ b/external/umpire.cmake
@@ -163,7 +163,7 @@ else()
     message(STATUS "custom target Umpire is expected to build these byproducts: ${UMPIRE_BUILD_BYPRODUCTS}")
 
     ExternalProject_Add(Umpire
-            PREFIX ${CMAKE_INSTALL_PREFIX}
+            PREFIX ${FETCHCONTENT_BASE_DIR}
             STAMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts
             TMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts   # needed in case CMAKE_INSTALL_PREFIX is not writable
             #--Download step--------------