From 92c6981861f955d7d5cbe2e07d23693e3d8a1bbe Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 09:19:23 -0500 Subject: [PATCH 01/20] [unit] fix range1_suite/constructors + add corner case to range1_suite/accessors --- tests/range1.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/range1.cpp b/tests/range1.cpp index bc2fabdd6c..ba49515cd7 100644 --- a/tests/range1.cpp +++ b/tests/range1.cpp @@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(constructors) { BOOST_CHECK_NO_THROW((Range1{1, 1})); BOOST_CHECK_NO_THROW(Range1(1, 1)); BOOST_CHECK_EQUAL(Range1(1, 1).first, 1); - BOOST_CHECK_EQUAL(Range1(1, 1).first, 1); + BOOST_CHECK_EQUAL(Range1(1, 1).second, 1); BOOST_CHECK_NO_THROW((Range1{-11, 13})); BOOST_CHECK_EQUAL(Range1(-11, 13).first, -11); @@ -86,6 +86,15 @@ BOOST_AUTO_TEST_CASE(accessors) { BOOST_CHECK_EQUAL(r.upbound(), 10); BOOST_CHECK_NO_THROW(r.extent()); BOOST_CHECK_EQUAL(r.extent(), 9); + + // corner case: empty range + Range1 r1{1, 1}; + BOOST_CHECK_NO_THROW(r1.lobound()); + BOOST_CHECK_EQUAL(r1.lobound(), 1); + BOOST_CHECK_NO_THROW(r1.upbound()); + BOOST_CHECK_EQUAL(r1.upbound(), 1); + BOOST_CHECK_NO_THROW(r.extent()); + BOOST_CHECK_EQUAL(r1.extent(), 0); } BOOST_AUTO_TEST_CASE(iteration) { @@ -134,13 +143,13 @@ BOOST_AUTO_TEST_CASE(serialization) { std::size_t buf_size = sizeof(Range1); unsigned char* buf = new unsigned char[buf_size]; madness::archive::BufferOutputArchive oar(buf, buf_size); - oar& r; + oar & r; std::size_t nbyte = oar.size(); oar.close(); Range1 rs; madness::archive::BufferInputArchive iar(buf, nbyte); - iar& rs; + iar & rs; iar.close(); delete[] buf; From f82330fb1a713fa9ac45e5451d4d3badca28d824 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 09:57:25 -0500 Subject: [PATCH 02/20] TiledRange1 supports empty tiles --- src/TiledArray/tiled_range1.h | 20 +++++++++++------- tests/tiled_range1.cpp | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index f1dc2369a8..69e5a5eea3 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -38,9 +38,11 @@ namespace TiledArray { /// TiledRange1 class defines a non-uniformly-tiled, contiguous, one-dimensional /// range. The tiling data is constructed with and stored in an array with -/// the format {a0, a1, a2, ...}, where 0 <= a0 < a1 < a2 < ... Each tile is +/// the format {a0, a1, a2, ...}, where a0 <= a1 <= a2 <= ... Each tile is /// defined as [a0,a1), [a1,a2), ... The number of tiles in the range will be /// equal to one less than the number of elements in the array. +/// \note if TiledArray was configured with `TA_SIGNED_1INDEX_TYPE=OFF` then the +/// tile boundaries must be non-negative. class TiledRange1 { private: struct Enabler {}; @@ -230,6 +232,7 @@ class TiledRange1 { if (!elem2tile_) { init_elem2tile_(); } + // N.B. only track elements in this range return elem2tile_[i - elements_range_.first]; } @@ -312,17 +315,17 @@ class TiledRange1 { TA_ASSERT((std::distance(first, last) >= 2) && "TiledRange1 construction failed: You need at least 2 " "elements in the tile boundary list."); - // Verify the requirement that a0 < a1 < a2 < ... + // Verify the requirement that a0 <= a1 <= a2 <= ... for (; first != (last - 1); ++first) { TA_ASSERT( - *first < *(first + 1) && + *first <= *(first + 1) && "TiledRange1 construction failed: Invalid tile boundary, tile " - "boundary i must be greater than tile boundary i+1 for all i. "); + "boundary i must not be greater than tile boundary i+1 for all i. "); TA_ASSERT( - static_cast(*first) < + static_cast(*first) <= static_cast(*(first + 1)) && "TiledRange1 construction failed: Invalid tile boundary, tile " - "boundary i must be greater than tile boundary i+1 for all i. "); + "boundary i must not be greater than tile boundary i+1 for all i. "); } } @@ -362,9 +365,10 @@ class TiledRange1 { // #endif const auto end = extent(range_); for (index1_type t = 0; t < end; ++t) - for (index1_type e = tiles_ranges_[t].first; - e < tiles_ranges_[t].second; ++e) + for (auto e : tiles_ranges_[t]) { + // only track elements in this range e2t[e - elements_range_.first] = t + range_.first; + } auto e2t_const = std::const_pointer_cast(e2t); // commit the changes std::swap(elem2tile_, e2t_const); diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 043e4b96ac..eb94091e59 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -152,6 +152,32 @@ BOOST_AUTO_TEST_CASE(constructor) { } } + // corner cases + { + // range with 1 empty tile + { + TiledRange1 r{0, 0}; + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 1); + BOOST_CHECK_EQUAL(r.elements_range().first, 0); + BOOST_CHECK_EQUAL(r.elements_range().second, 0); + BOOST_CHECK(r.tile(0) == Range1(0, 0)); + } + // range with some empty tiles + { + TiledRange1 r{1, 3, 3, 5, 5}; + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 4); + BOOST_CHECK_EQUAL(r.elements_range().first, 1); + BOOST_CHECK_EQUAL(r.elements_range().second, 5); + // test tiles + BOOST_CHECK(r.tile(0) == Range1(1, 3)); + BOOST_CHECK(r.tile(1) == Range1(3, 3)); + BOOST_CHECK(r.tile(2) == Range1(3, 5)); + BOOST_CHECK(r.tile(3) == Range1(5, 5)); + } + } + // Check that invalid input throws an exception. #ifndef NDEBUG { @@ -195,6 +221,20 @@ BOOST_AUTO_TEST_CASE(element_to_tile) { // Check that the expected and internal element to tile maps match. BOOST_CHECK_EQUAL_COLLECTIONS(c.begin(), c.end(), e.begin(), e.end()); + + // corner case: empty tiles + { + // range with some empty tiles + { + TiledRange1 r{1, 3, 3, 5, 5}; + BOOST_CHECK_TA_ASSERT(r.element_to_tile(0), Exception); + BOOST_CHECK_EQUAL(r.element_to_tile(1), 0); + BOOST_CHECK_EQUAL(r.element_to_tile(2), 0); + BOOST_CHECK_EQUAL(r.element_to_tile(3), 2); + BOOST_CHECK_EQUAL(r.element_to_tile(4), 2); + BOOST_CHECK_TA_ASSERT(r.element_to_tile(5), Exception); + } + } } BOOST_AUTO_TEST_CASE(comparison) { From 03b67de05e2062d1fa80ac0a5012899306f29e4a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 10:03:55 -0500 Subject: [PATCH 03/20] [unit] tiled_range_suite/constructor: test corner case with TiledRange1 composed of empty tiles only --- tests/tiled_range.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 14e47e3557..76702831a3 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -58,6 +58,14 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r1.elements_range().area(), 0); } + // construct with ranges containing empty tiles only + { + BOOST_REQUIRE_NO_THROW(TiledRange r1({dims[0], TiledRange1{1, 1, 1}})); + TiledRange r1{dims[0], TiledRange1{1, 1, 1}}; + BOOST_CHECK_EQUAL(r1.tiles_range().area(), dims[0].tile_extent() * 2); + BOOST_CHECK_EQUAL(r1.elements_range().area(), 0); + } + // check initializer list of initializer list constructor { TiledRange r1{ From c3f8b80b7d5c44b3a0b17915c2d8257177fcdd19 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 10:05:13 -0500 Subject: [PATCH 04/20] dox typo --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 2a658dc886..8795e699c6 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -350,7 +350,7 @@ class ContEngine : public BinaryEngine { left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); - // Initialize the process map in not already defined + // Initialize the process map if not already defined if (!pmap) pmap = proc_grid_.make_pmap(); ExprEngine_::init_distribution(world, pmap); } From a586733320f9ad216ef67cbe9b913153df5683df Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 14:06:58 -0500 Subject: [PATCH 05/20] SparseShape::sparsity() returns 0 for zero-volume ranges --- src/TiledArray/sparse_shape.h | 19 +-- tests/sparse_shape.cpp | 222 +++++++++++++++++++++++----------- 2 files changed, 161 insertions(+), 80 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index b589dc73cf..f7cc9355f7 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -516,10 +516,13 @@ class SparseShape { /// Sparsity of the shape - /// \return The fraction of tiles that are zero. + /// \return The fraction of tiles that are zero. Always returns 0 if + /// `this->data().size()` is zero. float sparsity() const { TA_ASSERT(!tile_norms_.empty()); - return float(zero_tile_count_) / float(tile_norms_.size()); + return tile_norms_.size() != 0 + ? float(zero_tile_count_) / float(tile_norms_.size()) + : 0.f; } // clang-format off @@ -1679,23 +1682,23 @@ class SparseShape { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& tile_norms_; + ar & tile_norms_; const unsigned int dim = tile_norms_.range().rank(); // allocate size_vectors_ size_vectors_ = std::move(std::shared_ptr( new vector_type[dim], std::default_delete())); - for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d]; - ar& zero_tile_count_; + for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d]; + ar & zero_tile_count_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& tile_norms_; + ar & tile_norms_; const unsigned int dim = tile_norms_.range().rank(); - for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d]; - ar& zero_tile_count_; + for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d]; + ar & zero_tile_count_; } private: diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index 0112f0dac6..64116dd687 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -121,9 +121,12 @@ BOOST_AUTO_TEST_CASE(non_comm_constructor) { } } - BOOST_CHECK_CLOSE(x.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + x.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK(x.nnz() == x.data().size() - zero_tile_count); // use the sparse ctor @@ -194,9 +197,12 @@ BOOST_AUTO_TEST_CASE(comm_constructor) { } } - BOOST_CHECK_CLOSE(x.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + x.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK_EQUAL(x.nnz(), x.data().size() - zero_tile_count); // use the sparse ctor @@ -321,7 +327,9 @@ BOOST_AUTO_TEST_CASE(block) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -413,7 +421,9 @@ BOOST_AUTO_TEST_CASE(block_scale) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -513,7 +523,9 @@ BOOST_AUTO_TEST_CASE(block_perm) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -614,7 +626,9 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -706,9 +720,12 @@ BOOST_AUTO_TEST_CASE(transform) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mask) { @@ -745,9 +762,12 @@ BOOST_AUTO_TEST_CASE(mask) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(scale) { @@ -778,9 +798,12 @@ BOOST_AUTO_TEST_CASE(scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(scale_perm) { @@ -812,9 +835,12 @@ BOOST_AUTO_TEST_CASE(scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add) { @@ -848,9 +874,12 @@ BOOST_AUTO_TEST_CASE(add) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK_EQUAL(result.nnz(), result.data().size() - zero_tile_count); } @@ -885,9 +914,12 @@ BOOST_AUTO_TEST_CASE(add_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_perm) { @@ -922,9 +954,12 @@ BOOST_AUTO_TEST_CASE(add_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_scale_perm) { @@ -959,9 +994,12 @@ BOOST_AUTO_TEST_CASE(add_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_const) { @@ -998,9 +1036,12 @@ BOOST_AUTO_TEST_CASE(add_const) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_const_perm) { @@ -1037,9 +1078,12 @@ BOOST_AUTO_TEST_CASE(add_const_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt) { @@ -1073,9 +1117,12 @@ BOOST_AUTO_TEST_CASE(subt) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_scale) { @@ -1109,9 +1156,12 @@ BOOST_AUTO_TEST_CASE(subt_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_perm) { @@ -1146,9 +1196,12 @@ BOOST_AUTO_TEST_CASE(subt_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_scale_perm) { @@ -1183,9 +1236,12 @@ BOOST_AUTO_TEST_CASE(subt_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_const) { @@ -1220,9 +1276,12 @@ BOOST_AUTO_TEST_CASE(subt_const) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_const_perm) { @@ -1260,9 +1319,12 @@ BOOST_AUTO_TEST_CASE(subt_const_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult) { @@ -1295,9 +1357,12 @@ BOOST_AUTO_TEST_CASE(mult) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_scale) { @@ -1330,9 +1395,12 @@ BOOST_AUTO_TEST_CASE(mult_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_perm) { @@ -1368,9 +1436,12 @@ BOOST_AUTO_TEST_CASE(mult_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_scale_perm) { @@ -1406,9 +1477,12 @@ BOOST_AUTO_TEST_CASE(mult_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(gemm) { @@ -1470,7 +1544,9 @@ BOOST_AUTO_TEST_CASE(gemm) { } BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(result_norms.size()), + result_norms.size() > 0 + ? float(zero_tile_count) / float(result_norms.size()) + : 0, tolerance); } @@ -1538,7 +1614,9 @@ BOOST_AUTO_TEST_CASE(gemm_perm) { } BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(result_norms.size()), + result_norms.size() > 0 + ? float(zero_tile_count) / float(result_norms.size()) + : 0, tolerance); } From e05d908469e738469ff2209ad650e221010099a6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 14:07:42 -0500 Subject: [PATCH 06/20] detail::permute works for zero-volume result/arg --- src/TiledArray/tensor/permute.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 43fbfc9328..4d46907172 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -127,6 +127,9 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, const unsigned int ndim1 = ndim - 1; const auto volume = arg0.range().volume(); + // handle the corner case of empty result/args + if (volume == 0) return; + // Get pointer to arg extent const auto* MADNESS_RESTRICT const arg0_extent = arg0.range().extent_data(); From ae41f770241a5e4a3a97b424c6b002b7e9128fdb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 15:52:19 -0500 Subject: [PATCH 07/20] BlockRange extended for zero-volume Range --- src/TiledArray/block_range.h | 27 +++++++++++++++++---------- tests/block_range.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/block_range.h b/src/TiledArray/block_range.h index 08096c96ea..06f8ecd629 100644 --- a/src/TiledArray/block_range.h +++ b/src/TiledArray/block_range.h @@ -85,7 +85,7 @@ class BlockRange : public Range { upper[d] = upper_bound_d; // Check input dimensions TA_ASSERT(lower[d] >= range.lobound(d)); - TA_ASSERT(lower[d] < upper[d]); + TA_ASSERT(lower[d] <= upper[d]); TA_ASSERT(upper[d] <= range.upbound(d)); extent[d] = upper[d] - lower[d]; TA_ASSERT(extent[d] == @@ -132,7 +132,7 @@ class BlockRange : public Range { upper[d] = upper_bound_d; // Check input dimensions TA_ASSERT(lower[d] >= range.lobound(d)); - TA_ASSERT(lower[d] < upper[d]); + TA_ASSERT(lower[d] <= upper[d]); TA_ASSERT(upper[d] <= range.upbound(d)); extent[d] = upper[d] - lower[d]; TA_ASSERT(extent[d] == @@ -177,9 +177,10 @@ class BlockRange : public Range { /// \param range the host Range /// \param lower_bound A sequence of lower bounds for each dimension /// \param upper_bound A sequence of upper bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `lower_bound[d] == upper_bound[d]` is supported /// \throw TiledArray::Exception When the size of \p lower_bound is not /// equal to that of \p upper_bound. - /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]` + /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]` // clang-format on template && @@ -204,9 +205,10 @@ class BlockRange : public Range { /// \param range the host Range /// \param lower_bound An initializer list of lower bounds for each dimension /// \param upper_bound An initializer list of upper bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `lower_bound[d] == upper_bound[d]` is supported /// \throw TiledArray::Exception When the size of \p lower_bound is not /// equal to that of \p upper_bound. - /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]` + /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]` // clang-format on template && @@ -247,7 +249,8 @@ class BlockRange : public Range { /// \endcode /// \tparam PairRange Type representing a range of generalized pairs (see TiledArray::detail::is_gpair_v ) /// \param bounds A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bounds[i].lower>=bounds[i].upper` for any \c i . + /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template >> @@ -264,8 +267,9 @@ class BlockRange : public Range { /// BlockRange br0(r, {std::make_pair(0,4), std::pair{1,6}, std::pair(2,8)}); /// \endcode /// \tparam GPair a generalized pair of integral types - /// \param bound A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i . + /// \param bounds A range of {lower,upper} bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template BlockRange(const Range& range, const std::initializer_list& bounds, @@ -290,8 +294,9 @@ class BlockRange : public Range { /// BlockRange br0(r, {{0,4}, {1,6}, {2,8}}); /// \endcode /// \tparam Index An integral type - /// \param bound A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i . + /// \param bounds A range of {lower,upper} bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template >> @@ -354,6 +359,8 @@ class BlockRange : public Range { /// \return The ordinal index in the /// \throw TiledArray::Exception When \c index is not included in this range ordinal_type ordinal(ordinal_type ord) const { + // ordinals are useless for zero-volume ranges + TA_ASSERT(volume() != 0); // Check that ord is contained by this range. TA_ASSERT(Range::includes_ordinal(ord)); @@ -414,7 +421,7 @@ class BlockRange : public Range { template void serialize(Archive& ar) const { Range::serialize(ar); - ar& block_offset_; + ar & block_offset_; } }; // BlockRange diff --git a/tests/block_range.cpp b/tests/block_range.cpp index 135c36d0b4..5d8431fa41 100644 --- a/tests/block_range.cpp +++ b/tests/block_range.cpp @@ -72,7 +72,7 @@ BOOST_AUTO_TEST_CASE(block_zero_lower_bound) { for (unsigned int i = 0u; i < upper.size(); ++i) ++(upper[i]); if (std::equal(lower.begin(), lower.end(), upper.begin(), - [](std::size_t l, std::size_t r0) { return l < r0; })) { + [](std::size_t l, std::size_t r0) { return l <= r0; })) { if (count_valid == target_count) continue; ++count_valid; @@ -141,7 +141,7 @@ BOOST_AUTO_TEST_CASE(block) { for (unsigned int i = 0u; i < r.rank(); ++i) ++(upper[i]); if (std::equal(lower.begin(), lower.end(), upper.begin(), - [](std::size_t l, std::size_t r) { return l < r; })) { + [](std::size_t l, std::size_t r) { return l <= r; })) { if (count_valid == target_count) continue; ++count_valid; @@ -269,4 +269,26 @@ BOOST_AUTO_TEST_CASE(block) { end:; } +BOOST_AUTO_TEST_CASE(empty_trange1) { + using TiledArray::eigen::iv; + // host range is non-empty but one of the dimensions will have no tiles + { + BOOST_CHECK_NO_THROW(BlockRange(r, iv(3, 3, 3), iv(4, 3, 5))); + BlockRange br(r, iv(3, 3, 3), iv(4, 3, 5)); + BOOST_CHECK_EQUAL(br.volume(), 0); + BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception); + } + + // host range is non-empty but one of the dimensions will have no tiles + { + BOOST_CHECK_NO_THROW( + BlockRange(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0), + iv(1, 0, 1))); + BlockRange br(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0), + iv(1, 0, 1)); + BOOST_CHECK_EQUAL(br.volume(), 0); + BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception); + } +} + BOOST_AUTO_TEST_SUITE_END() From 463f6a75254dcc325e9b0a37c21f84a3723dfd0f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 16:47:08 -0500 Subject: [PATCH 08/20] SparseShape::block() supports zero-volume blocks --- src/TiledArray/sparse_shape.h | 4 ++-- tests/sparse_shape.cpp | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index f7cc9355f7..a7df1c520c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -840,7 +840,7 @@ class SparseShape { // Check that the input indices are in range TA_ASSERT(lower_d >= tile_norms_.range().lobound(d)); - TA_ASSERT(lower_d < upper_d); + TA_ASSERT(lower_d <= upper_d); TA_ASSERT(upper_d <= tile_norms_.range().upbound(d)); // Construct the size vector for rank i @@ -874,7 +874,7 @@ class SparseShape { // Check that the input indices are in range TA_ASSERT(lower_d >= tile_norms_.range().lobound(d)); - TA_ASSERT(lower_d < upper_d); + TA_ASSERT(lower_d <= upper_d); TA_ASSERT(upper_d <= tile_norms_.range().upbound(d)); // Construct the size vector for rank i diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index 64116dd687..a79d7ceb8e 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -276,7 +276,7 @@ BOOST_AUTO_TEST_CASE(block) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); for (auto lower_it = tr.tiles_range().begin(); lower_it != tr.tiles_range().end(); ++lower_it) { @@ -287,7 +287,7 @@ BOOST_AUTO_TEST_CASE(block) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper)); @@ -369,7 +369,7 @@ BOOST_AUTO_TEST_CASE(block_scale) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const float factor = 3.3; for (auto lower_it = tr.tiles_range().begin(); @@ -381,7 +381,7 @@ BOOST_AUTO_TEST_CASE(block_scale) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = @@ -468,7 +468,7 @@ BOOST_AUTO_TEST_CASE(block_perm) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const auto inv_perm = perm.inv(); for (auto lower_it = tr.tiles_range().begin(); @@ -480,7 +480,7 @@ BOOST_AUTO_TEST_CASE(block_perm) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper, perm)); @@ -569,7 +569,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const float factor = 3.3; const auto inv_perm = perm.inv(); @@ -582,7 +582,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW( From a8d39125da85229a57444b62be9df0b7997c41e0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 14:15:51 -0500 Subject: [PATCH 09/20] [unit] expressions suite: tensor_factories tests DistArrays with empty trange1 --- src/TiledArray/expressions/blk_tsr_engine.h | 81 ++++++++++++++------- src/TiledArray/expressions/blk_tsr_expr.h | 2 +- tests/expressions_fixture.h | 12 ++- tests/expressions_impl.h | 26 +++++++ 4 files changed, 91 insertions(+), 30 deletions(-) diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index 2d16172dbe..31ad29ee74 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -194,16 +194,19 @@ class BlkTsrEngineBase : public LeafEngine { const auto lower_d = lower[d]; const auto upper_d = upper[d]; - // Copy and shift the tiling for the block - auto i = lower_d; - const auto base_d = trange[d].tile(i).first; - trange1_data.emplace_back(0ul); - for (; i < upper_d; ++i) - trange1_data.emplace_back(trange[d].tile(i).second - base_d); - - // Add the trange1 to the tiled range data - trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); - trange1_data.resize(0ul); + // Copy and shift the tiling for the block, if nonempty + if (lower_d != upper_d) { + auto i = lower_d; + const auto base_d = trange[d].tile(i).first; + trange1_data.emplace_back(0ul); + for (; i < upper_d; ++i) + trange1_data.emplace_back(trange[d].tile(i).second - base_d); + // Add the trange1 to the tiled range data + trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); + trange1_data.resize(0ul); + } else { + trange_data.emplace_back(); + } } return TiledRange(trange_data.begin(), trange_data.end()); @@ -233,16 +236,19 @@ class BlkTsrEngineBase : public LeafEngine { const auto lower_i = lower[inv_perm_d]; const auto upper_i = upper[inv_perm_d]; - // Copy, shift, and permute the tiling of the block - auto i = lower_i; - const auto base_d = trange[inv_perm_d].tile(i).first; - trange1_data.emplace_back(0ul); - for (; i < upper_i; ++i) - trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); - - // Add the trange1 to the tiled range data - trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); - trange1_data.resize(0ul); + if (lower_i != upper_i) { + // Copy, shift, and permute the tiling of the block + auto i = lower_i; + const auto base_d = trange[inv_perm_d].tile(i).first; + trange1_data.emplace_back(0ul); + for (; i < upper_i; ++i) + trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); + + // Add the trange1 to the tiled range data + trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); + trange1_data.resize(0ul); + } else + trange_data.emplace_back(); } return TiledRange(trange_data.begin(), trange_data.end()); @@ -376,12 +382,18 @@ class BlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the range shift vector for (unsigned int d = 0u; d < rank; ++d) { const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift.emplace_back(-base_d); + } else { + range_shift.emplace_back(0l); + } } return op_type(op_base_type(range_shift)); @@ -402,6 +414,7 @@ class BlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the permuted range shift vector auto outer_perm = outer(perm); @@ -409,8 +422,11 @@ class BlkTsrEngine for (unsigned int d = 0u; d < rank; ++d) { const auto perm_d = outer_perm[d]; const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift[perm_d] = -base_d; + } } return op_type(op_base_type(range_shift), perm); @@ -522,12 +538,17 @@ class ScalBlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Construct the inverse permutation for (unsigned int d = 0u; d < rank; ++d) { const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift.emplace_back(-base_d); + } else + range_shift.emplace_back(0); } return op_type(op_base_type(range_shift, factor_)); @@ -548,6 +569,7 @@ class ScalBlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the permuted range shift vector auto outer_perm = outer(perm); @@ -555,8 +577,11 @@ class ScalBlkTsrEngine for (unsigned int d = 0u; d < rank; ++d) { const auto perm_d = outer_perm[d]; const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift[perm_d] = -base_d; + } } return op_type(op_base_type(range_shift, factor_), perm); diff --git a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h index d32603b58f..5d6612d5cc 100644 --- a/src/TiledArray/expressions/blk_tsr_expr.h +++ b/src/TiledArray/expressions/blk_tsr_expr.h @@ -179,7 +179,7 @@ class BlkTsrExprBase : public Expr { const bool lower_upper_bound_check = std::equal(std::begin(lower_bound_), std::end(lower_bound_), std::begin(upper_bound_), - [](std::size_t l, std::size_t r) { return l < r; }); + [](std::size_t l, std::size_t r) { return l <= r; }); if (!lower_upper_bound_check) { if (TiledArray::get_default_world().rank() == 0) { using TiledArray::operator<<; diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index 3e493b1200..a4f7e4cd0b 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -62,9 +62,11 @@ struct ExpressionsFixture : public TiledRangeFixture { s_tr1_1(make_random_sparseshape(trange1)), s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), + s_trC(make_random_sparseshape(trangeC)), a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), + aC(*GlobalFixture::world, trangeC, s_trC), u(*GlobalFixture::world, trange1, s_tr1_1), v(*GlobalFixture::world, trange1, s_tr1_2), w(*GlobalFixture::world, trange2, s_tr2) { @@ -72,6 +74,7 @@ struct ExpressionsFixture : public TiledRangeFixture { random_fill(b); random_fill(u); random_fill(v); + random_fill(aC); GlobalFixture::world->gop.fence(); a.truncate(); b.truncate(); @@ -88,11 +91,13 @@ struct ExpressionsFixture : public TiledRangeFixture { c(*GlobalFixture::world, tr), u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), - w(*GlobalFixture::world, trange2) { + w(*GlobalFixture::world, trange2), + aC(*GlobalFixture::world, trangeC) { random_fill(a); random_fill(b); random_fill(u); random_fill(v); + random_fill(aC); GlobalFixture::world->gop.fence(); } @@ -213,17 +218,22 @@ struct ExpressionsFixture : public TiledRangeFixture { const TiledRange trange1{{0, 2, 5, 10, 17, 28, 41}}; const TiledRange trange2{{0, 2, 5, 10, 17, 28, 41}, {0, 3, 6, 11, 18, 29, 42}}; + // contains empty trange1 + const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{}, + TiledRange1{0, 2, 7, 11}}; SparseShape s_tr_1; SparseShape s_tr_2; SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; + SparseShape s_trC; TArray a; TArray b; TArray c; TArray u; TArray v; TArray w; + TArray aC; }; // ExpressionsFixture #endif // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 91bcb10cc4..6fdfc2ce0e 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -31,6 +31,7 @@ constexpr int nrepeats = 5; BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { auto& a = F::a; auto& c = F::c; + auto& aC = F::aC; const auto& ca = a; const std::array lobound{{3, 3, 3}}; @@ -2941,6 +2942,31 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) { BOOST_CHECK_EQUAL(result, expected); } +// corner case: expressions involving array with empty trange1 +BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { + auto& c = F::c; + auto& aC = F::aC; + + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + + using TiledArray::eigen::iv; + const std::array lobound{{0, 0, 1}}; + const std::array upbound{{1, 0, 2}}; + + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); + BOOST_CHECK_NO_THROW(c("a,b,c") += + 2 * aC("a,b,c").block(lobound, upbound).conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = + 2 * conj(aC("a,c,b").block(lobound, upbound))); +} + BOOST_AUTO_TEST_SUITE_END() #endif // TILEDARRAY_TEST_EXPRESSIONS_IMPL_H From df7e0c804dfa6f5901fa4e6bedbeb1993e2a5286 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 23:46:31 -0500 Subject: [PATCH 10/20] GEMM ld{a,b,c} must be >= 1 see https://netlib.org/lapack/explore-html/dd/d09/group__gemm_ga1e899f8453bcbfde78e91a86a2dab984.html#ga1e899f8453bcbfde78e91a86a2dab984 --- src/TiledArray/device/btas.h | 26 ++++++++++++++++---------- src/TiledArray/external/btas.h | 26 ++++++++++++++++---------- src/TiledArray/tensor/tensor.h | 14 +++++++++----- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/TiledArray/device/btas.h b/src/TiledArray/device/btas.h index acd42341fd..b30fdd4edd 100644 --- a/src/TiledArray/device/btas.h +++ b/src/TiledArray/device/btas.h @@ -77,10 +77,12 @@ ::btas::Tensor gemm( gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); T factor_t = T(factor); T zero(0); @@ -112,10 +114,11 @@ ::btas::Tensor gemm( static_assert(::btas::boxrange_iteration_order::value == ::btas::boxrange_iteration_order::row_major); + const integer ldc = std::max(integer{1}, n); blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), - lda, zero, device_data(result.storage()), n, queue); + lda, zero, device_data(result.storage()), ldc, queue); device::sync_madness_task_with(stream); } @@ -185,10 +188,12 @@ void gemm(::btas::Tensor &result, gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); auto &queue = blasqueue_for(result.range()); const auto stream = device::Stream(queue.device(), queue.stream()); @@ -207,10 +212,11 @@ void gemm(::btas::Tensor &result, static_assert(::btas::boxrange_iteration_order::value == ::btas::boxrange_iteration_order::row_major); + const integer ldc = std::max(integer{1}, n); blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), - lda, one, device_data(result.storage()), n, queue); + lda, one, device_data(result.storage()), ldc, queue); device::sync_madness_task_with(stream); } } diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index 11971c269e..fe84e6f0c6 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -661,16 +661,19 @@ inline btas::Tensor gemm( gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); T factor_t(factor); + const integer ldc = std::max(integer{1}, n); TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor_t, left.data(), lda, right.data(), - ldb, T(0), result.data(), n); + ldb, T(0), result.data(), ldc); return result; } @@ -736,16 +739,19 @@ inline void gemm(btas::Tensor& result, gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); T factor_t(factor); + const integer ldc = std::max(integer{1}, n); TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor_t, left.data(), lda, right.data(), - ldb, T(1), result.data(), n); + ldb, T(1), result.data(), ldc); } // sum of the hyperdiagonal elements diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 9c36b071cc..c12c2c15d1 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -2648,10 +2648,13 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n + : k)); // may need to split gemm into multiply + accumulate for tracing purposes #ifdef TA_ENABLE_TILE_OPS_LOGGING @@ -2719,8 +2722,9 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, } } #else // TA_ENABLE_TILE_OPS_LOGGING + const integer ldc = std::max(integer{1}, n); math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, - alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n); + alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc); #endif // TA_ENABLE_TILE_OPS_LOGGING } } From e1883fecc61ecc9ff4cb341cc18d374cd8763a45 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 00:01:53 -0500 Subject: [PATCH 11/20] [unit] expression suite: can contract + reduce zero-volume DistArrays --- src/TiledArray/dist_eval/contraction_eval.h | 8 ++-- src/TiledArray/pmap/cyclic_pmap.h | 4 -- src/TiledArray/proc_grid.h | 6 --- tests/cyclic_pmap.cpp | 6 --- tests/expressions_impl.h | 50 +++++++++++++++------ 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 2da66628fc..4755538689 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -891,9 +891,9 @@ class Summa ordinal_type initialize(const DenseShape&) { // Construct static broadcast groups for dense arguments const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); - col_group_ = proc_grid_.make_col_group(col_did); + if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did); const madness::DistributedID row_did(DistEvalImpl_::id(), k_); - row_group_ = proc_grid_.make_row_group(row_did); + if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE std::stringstream ss; @@ -1347,7 +1347,6 @@ class Summa template void make_next_step_tasks(Derived* task, ordinal_type depth) { - TA_ASSERT(depth > 0); // Set the depth to be no greater than the maximum number steps if (depth > owner_->k_) depth = owner_->k_; @@ -1706,6 +1705,9 @@ class Summa std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols())); + // corner case: empty result + if (k_ == 0) return 0; + // Construct the first SUMMA iteration task if (TensorImpl_::shape().is_dense()) { // We cannot have more iterations than there are blocks in the k diff --git a/src/TiledArray/pmap/cyclic_pmap.h b/src/TiledArray/pmap/cyclic_pmap.h index 6d2df0088b..250b4f677b 100644 --- a/src/TiledArray/pmap/cyclic_pmap.h +++ b/src/TiledArray/pmap/cyclic_pmap.h @@ -84,10 +84,6 @@ class CyclicPmap : public Pmap { cols_(cols), proc_cols_(proc_cols), proc_rows_(proc_rows) { - // Check that the size is non-zero - TA_ASSERT(rows_ >= 1ul); - TA_ASSERT(cols_ >= 1ul); - // Check limits of process rows and columns TA_ASSERT(proc_rows_ >= 1ul); TA_ASSERT(proc_cols_ >= 1ul); diff --git a/src/TiledArray/proc_grid.h b/src/TiledArray/proc_grid.h index a401e0ac1e..cd15c1b73e 100644 --- a/src/TiledArray/proc_grid.h +++ b/src/TiledArray/proc_grid.h @@ -288,12 +288,6 @@ class ProcGrid { local_rows_(0ul), local_cols_(0ul), local_size_(0ul) { - // Check for non-zero sizes - TA_ASSERT(rows_ >= 1u); - TA_ASSERT(cols_ >= 1u); - TA_ASSERT(row_size >= 1ul); - TA_ASSERT(col_size >= 1ul); - init(world_->rank(), world_->size(), row_size, col_size); } diff --git a/tests/cyclic_pmap.cpp b/tests/cyclic_pmap.cpp index 4d8d76da1f..b8c2b9670c 100644 --- a/tests/cyclic_pmap.cpp +++ b/tests/cyclic_pmap.cpp @@ -60,12 +60,6 @@ BOOST_AUTO_TEST_CASE(constructor) { ProcessID size = GlobalFixture::world->size(); - BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 0ul, 10ul, 1, 1), - TiledArray::Exception); - BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 0ul, 1, 1), - TiledArray::Exception); BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( *GlobalFixture::world, 10ul, 10ul, 0, 1), TiledArray::Exception); diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 6fdfc2ce0e..ea4beab3d6 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -2947,24 +2947,48 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { auto& c = F::c; auto& aC = F::aC; - BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); - BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); - BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); - BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + // unary/binary expressions + { + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + } using TiledArray::eigen::iv; const std::array lobound{{0, 0, 1}}; const std::array upbound{{1, 0, 2}}; - BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); - BOOST_CHECK_NO_THROW(c("a,b,c") += - 2 * aC("a,b,c").block(lobound, upbound).conj()); - BOOST_CHECK_NO_THROW(c("a,b,c") = - 2 * conj(aC("a,c,b").block(lobound, upbound))); + // unary/binary block expressions + { + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); + BOOST_CHECK_NO_THROW(c("a,b,c") += + 2 * aC("a,b,c").block(lobound, upbound).conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = + 2 * conj(aC("a,c,b").block(lobound, upbound))); + } + + // contraction expressions + { + std::decay_t t2, t4; + // contraction over empty dim + BOOST_CHECK_NO_THROW(t4("a,c,e,d") = aC("a,b,c") * aC("d,b,e")); + // contraction over empty and nonempty dims + BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c")); + // contraction over nonempty dims + BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c")); + } + + // reduction expressions + { + // contraction over empty dim + BOOST_CHECK_NO_THROW(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get()); + BOOST_CHECK_EQUAL(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get(), 0); + } } BOOST_AUTO_TEST_SUITE_END() From fa830a183178a68013e42abc6ecda3f329eff4e8 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 09:24:02 -0500 Subject: [PATCH 12/20] expression suite: fixups for contractions w/ zero-volume contraction range and w/ nonzero-volume contraction range producing zero-volume result --- src/TiledArray/dist_eval/contraction_eval.h | 184 ++++++++++++-------- src/TiledArray/expressions/cont_engine.h | 30 ++-- tests/expressions_fixture.h | 12 +- tests/expressions_impl.h | 3 + 4 files changed, 146 insertions(+), 83 deletions(-) diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 4755538689..a747c0748b 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -889,45 +889,63 @@ class Summa /// Initialize reduce tasks and construct broadcast groups ordinal_type initialize(const DenseShape&) { - // Construct static broadcast groups for dense arguments - const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); - if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did); - const madness::DistributedID row_did(DistEvalImpl_::id(), k_); - if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did); + // if contraction is over zero-volume range just initialize tiles to zero + if (k_ == 0) { + ordinal_type tile_count = 0; + const auto& tiles_range = this->trange().tiles_range(); + for (auto&& tile_idx : tiles_range) { + auto tile_ord = tiles_range.ordinal(tile_idx); + if (this->is_local(tile_ord)) { + this->world().taskq.add([this, tile_ord, tile_idx]() { + this->set_tile(tile_ord, + value_type(this->trange().tile(tile_idx), + typename value_type::value_type{})); + }); + ++tile_count; + } + } + return tile_count; + } else { + // Construct static broadcast groups for dense arguments + const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); + col_group_ = proc_grid_.make_col_group(col_did); + const madness::DistributedID row_did(DistEvalImpl_::id(), k_); + row_group_ = proc_grid_.make_row_group(row_did); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - std::stringstream ss; - ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=(" - << col_did.first << ", " << col_did.second << ") { "; - for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc) - ss << col_group_.world_rank(gproc) << " "; - ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second - << ") { "; - for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc) - ss << row_group_.world_rank(gproc) << " "; - ss << "}\n"; - printf(ss.str().c_str()); + std::stringstream ss; + ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=(" + << col_did.first << ", " << col_did.second << ") { "; + for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc) + ss << col_group_.world_rank(gproc) << " "; + ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second + << ") { "; + for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc) + ss << row_group_.world_rank(gproc) << " "; + ss << "}\n"; + printf(ss.str().c_str()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - // Allocate memory for the reduce pair tasks. - std::allocator> alloc; - reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); + // Allocate memory for the reduce pair tasks. + std::allocator> alloc; + reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); - // Iterate over all local tiles - const ordinal_type n = proc_grid_.local_size(); - for (ordinal_type t = 0ul; t < n; ++t) { - // Initialize the reduction task - ReducePairTask* MADNESS_RESTRICT const reduce_task = - reduce_tasks_ + t; - new (reduce_task) ReducePairTask(TensorImpl_::world(), op_ + // Iterate over all local tiles + const ordinal_type n = proc_grid_.local_size(); + for (ordinal_type t = 0ul; t < n; ++t) { + // Initialize the reduction task + ReducePairTask* MADNESS_RESTRICT const reduce_task = + reduce_tasks_ + t; + new (reduce_task) ReducePairTask(TensorImpl_::world(), op_ #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - , - nullptr, t + , + nullptr, t #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - ); - } + ); + } - return proc_grid_.local_size(); + return proc_grid_.local_size(); + } } /// Initialize reduce tasks @@ -938,6 +956,9 @@ class Summa ss << " initialize rank=" << TensorImpl_::world().rank() << " tiles={ "; #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE + // fast return if there is no work to do + if (k_ == 0) return 0; + // Allocate memory for the reduce pair tasks. std::allocator> alloc; reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); @@ -1705,60 +1726,79 @@ class Summa std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols())); - // corner case: empty result - if (k_ == 0) return 0; - - // Construct the first SUMMA iteration task - if (TensorImpl_::shape().is_dense()) { - // We cannot have more iterations than there are blocks in the k - // dimension - if (depth > k_) depth = k_; - - // Modify the number of concurrent iterations based on the available - // memory. - depth = mem_bound_depth(depth, 0.0f, 0.0f); - - // Enforce user defined depth bound - if (max_depth_) depth = std::min(depth, max_depth_); - - TensorImpl_::world().taskq.add( - new DenseStepTask(shared_from_this(), depth)); - } else { - // Increase the depth based on the amount of sparsity in an iteration. + // watch out for the corner case: contraction over zero-volume range + // producing nonzero-volume result ... in that case there is nothing to do + // the appropriate initialization was performed in the initialize() method + if (k_ != 0) { + // Construct the first SUMMA iteration task + if (TensorImpl_::shape().is_dense()) { + // We cannot have more iterations than there are blocks in the k + // dimension + if (depth > k_) depth = k_; + + // Modify the number of concurrent iterations based on the available + // memory. + depth = mem_bound_depth(depth, 0.0f, 0.0f); + + // Enforce user defined depth bound + if (max_depth_) depth = std::min(depth, max_depth_); + + TensorImpl_::world().taskq.add( + new DenseStepTask(shared_from_this(), depth)); + } else { + // Increase the depth based on the amount of sparsity in an iteration. - // Get the sparsity fractions for the left- and right-hand arguments. - const float left_sparsity = left_.shape().sparsity(); - const float right_sparsity = right_.shape().sparsity(); + // Get the sparsity fractions for the left- and right-hand arguments. + const float left_sparsity = left_.shape().sparsity(); + const float right_sparsity = right_.shape().sparsity(); - // Compute the fraction of non-zero result tiles in a single SUMMA - // iteration. - const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) * - (1.0f - std::min(right_sparsity, 0.9f)); + // Compute the fraction of non-zero result tiles in a single SUMMA + // iteration. + const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) * + (1.0f - std::min(right_sparsity, 0.9f)); - // Compute the new depth based on sparsity of the arguments - depth = - float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f; + // Compute the new depth based on sparsity of the arguments + depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + + 0.5f; - // We cannot have more iterations than there are blocks in the k - // dimension - if (depth > k_) depth = k_; + // We cannot have more iterations than there are blocks in the k + // dimension + if (depth > k_) depth = k_; - // Modify the number of concurrent iterations based on the available - // memory and sparsity of the argument tensors. - depth = mem_bound_depth(depth, left_sparsity, right_sparsity); + // Modify the number of concurrent iterations based on the available + // memory and sparsity of the argument tensors. + depth = mem_bound_depth(depth, left_sparsity, right_sparsity); - // Enforce user defined depth bound - if (max_depth_) depth = std::min(depth, max_depth_); + // Enforce user defined depth bound + if (max_depth_) depth = std::min(depth, max_depth_); - TensorImpl_::world().taskq.add( - new SparseStepTask(shared_from_this(), depth)); - } + TensorImpl_::world().taskq.add( + new SparseStepTask(shared_from_this(), depth)); + } + } // k_ != 0 } #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL + // corner case: if left or right are zero-volume no tasks were scheduled, so + // need to discard all of their tiles manually + if (left_.range().volume() == 0) { + for (auto&& tile_idx : right_.range()) { + auto tile_ord = right_.range().ordinal(tile_idx); + if (right_.is_local(tile_ord) && !right_.is_zero(tile_ord)) + right_.discard(tile_ord); + } + } + if (right_.range().volume() == 0) { + for (auto&& tile_idx : left_.range()) { + auto tile_ord = left_.range().ordinal(tile_idx); + if (left_.is_local(tile_ord) && !left_.is_zero(tile_ord)) + left_.discard(tile_ord); + } + } + // Wait for child tensors to be evaluated, and process tasks while waiting. left_.wait(); right_.wait(); diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 8795e699c6..94562b5154 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -343,16 +343,26 @@ class ContEngine : public BinaryEngine { n *= right_element_size[i]; } - // Construct the process grid. - proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); - - // Initialize children - left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); - right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); - - // Initialize the process map if not already defined - if (!pmap) pmap = proc_grid_.make_pmap(); - ExprEngine_::init_distribution(world, pmap); + // corner case: zero-volume result ... easier to skip proc_grid_ + // construction alltogether + if (M == 0 || N == 0) { + left_.init_distribution(world, {}); + right_.init_distribution(world, {}); + ExprEngine_::init_distribution( + world, (pmap ? pmap : policy::default_pmap(*world, M * N))); + } else { // M!=0 && N!=0 + + // Construct the process grid. + proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); + + // Initialize children + left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); + right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); + + // Initialize the process map if not already defined + if (!pmap) pmap = proc_grid_.make_pmap(); + ExprEngine_::init_distribution(world, pmap); + } } /// Tiled range factory function diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index a4f7e4cd0b..8e527465d1 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -63,10 +63,12 @@ struct ExpressionsFixture : public TiledRangeFixture { s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), s_trC(make_random_sparseshape(trangeC)), + s_trC_f(make_random_sparseshape(trangeC_f)), a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), aC(*GlobalFixture::world, trangeC, s_trC), + aC_f(*GlobalFixture::world, trangeC_f, s_trC_f), u(*GlobalFixture::world, trange1, s_tr1_1), v(*GlobalFixture::world, trange1, s_tr1_2), w(*GlobalFixture::world, trange2, s_tr2) { @@ -92,12 +94,14 @@ struct ExpressionsFixture : public TiledRangeFixture { u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), w(*GlobalFixture::world, trange2), - aC(*GlobalFixture::world, trangeC) { + aC(*GlobalFixture::world, trangeC), + aC_f(*GlobalFixture::world, trangeC_f) { random_fill(a); random_fill(b); random_fill(u); random_fill(v); random_fill(aC); + random_fill(aC_f); GlobalFixture::world->gop.fence(); } @@ -221,12 +225,17 @@ struct ExpressionsFixture : public TiledRangeFixture { // contains empty trange1 const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{}, TiledRange1{0, 2, 7, 11}}; + // like trC, but with all dimension nonempty + const TiledRange trangeC_f{trangeC.dim(0), TiledRange1{0, 4, 7}, + trangeC.dim(2)}; + SparseShape s_tr_1; SparseShape s_tr_2; SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; SparseShape s_trC; + SparseShape s_trC_f; TArray a; TArray b; TArray c; @@ -234,6 +243,7 @@ struct ExpressionsFixture : public TiledRangeFixture { TArray v; TArray w; TArray aC; + TArray aC_f; }; // ExpressionsFixture #endif // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index ea4beab3d6..268b118568 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -2946,6 +2946,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) { BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { auto& c = F::c; auto& aC = F::aC; + auto& aC_f = F::aC_f; // unary/binary expressions { @@ -2981,6 +2982,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c")); // contraction over nonempty dims BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c")); + // contraction over nonempty dims, involving expressions with nonzero-volume + BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * (2. * aC_f("d,e,c"))); } // reduction expressions From f0112af07423ce85dfba1cf1f8325a84111b1894 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 10:00:03 -0500 Subject: [PATCH 13/20] [unit] re-add block_range_suite --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3bcf8de967..76bb14e4b1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,7 @@ set(executable ta_test) set(ta_test_src_files ta_test.cpp range1.cpp range.cpp + block_range.cpp type_traits.cpp tensor.cpp tensor_of_tensor.cpp From 087202a0e9b00b10638f78a9926431e9be981f11 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 11:46:31 -0500 Subject: [PATCH 14/20] one more zero-volume corner case, in Expr::eval_to(BlkTsrExpr..) --- src/TiledArray/expressions/expr.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index f77d13dbad..c3fdd6423b 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -504,8 +504,11 @@ class Expr { // Move the data from dist_eval into the sub-block of result array. // This step may involve communication when the tiles are moved from the // sub-block distribution to the array distribution. - { + // N.B. handle the corner case of zero-volume host array, then no data needs + // to be moved + if (tsr.array().trange().tiles_range().volume() != 0) { // N.B. must deep copy + TA_ASSERT(tsr.array().trange().tiles_range().includes(tsr.lower_bound())); const container::svector shift = tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(); From 5809ff25258a837783983121a8df8ca2d89f4b32 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 1 Apr 2024 23:29:45 -0400 Subject: [PATCH 15/20] typo --- examples/dgemm/ta_blas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dgemm/ta_blas.cpp b/examples/dgemm/ta_blas.cpp index 0a4feff383..aeefaae908 100644 --- a/examples/dgemm/ta_blas.cpp +++ b/examples/dgemm/ta_blas.cpp @@ -69,7 +69,7 @@ int main(int argc, char** argv) { // Start clock const double wall_time_start = madness::wall_time(); - // Do matrix multiplcation + // Do matrix multiplication // Note: If TiledArray has not been configured with blas, this will be an // eigen call. for (int i = 0; i < repeat; ++i) { From 0dbd0eee07a44f7b577fdf11038addcc1d466883 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 00:00:50 -0400 Subject: [PATCH 16/20] introduced patcher for Intel MKL's unfair dispatch, enabled by configuring with IntelMKL_FAIR_DISPATCH=ON see https://www.agner.org/optimize/intel_dispatch_patch.zip --- CMakeLists.txt | 3 + INSTALL.md | 1 + src/CMakeLists.txt | 10 +++ src/TiledArray/config.h.in | 2 + .../agnerfog/intel_cpu_feature_patch.c | 48 +++++++++++ .../external/agnerfog/intel_mkl_cpuid_patch.c | 61 ++++++++++++++ .../agnerfog/intel_mkl_feature_patch.c | 49 +++++++++++ src/TiledArray/external/agnerfog/readme.txt | 84 +++++++++++++++++++ src/TiledArray/tiledarray.cpp | 7 ++ 9 files changed, 265 insertions(+) create mode 100644 src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c create mode 100644 src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c create mode 100644 src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c create mode 100644 src/TiledArray/external/agnerfog/readme.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index a97e6561f8..7bc524337b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,9 @@ add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library") option(TA_TTG "Enable search/build of TTG library" OFF) add_feature_info(TA_TTG TA_TTG "TTG library") +option(IntelMKL_FAIR_DISPATCH "Enable fair dispatch in Intel MKL" OFF) +add_feature_info(IntelMKL_FAIR_DISPATCH IntelMKL_FAIR_DISPATCH "Use of fair dispatch in Intel MKL") + # Enable shared library support options redefaultable_option(TA_ASSUMES_ASLR_DISABLED "TiledArray assumes the Address Space Layout Randomization (ASLR) to be disabled" OFF) add_feature_info(ASSUMES_ASLR_DISABLED TA_ASSUMES_ASLR_DISABLED diff --git a/INSTALL.md b/INSTALL.md index 6e7c6fc746..3f669073f0 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -423,6 +423,7 @@ support may be added. * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout. * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray. * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s. +* `IntelMKL_FAIR_DISPATCH` -- If want to use Intel MKL library on non-Intel (e.g., AMD) CPUs, set to `ON` to use fair kernel dispatch. [Default=OFF]. # Build TiledArray diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9bb82bf537..0167aab636 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -313,6 +313,16 @@ if( TARGET ttg-parsec ) list(APPEND _TILEDARRAY_DEPENDENCIES ttg-parsec) endif() +if (IntelMKL_FAIR_DISPATCH AND BLAS_IS_MKL) + message(WARNING "created tiledarray_mkl_dispatch") + add_library(tiledarray_mkl_dispatch OBJECT + TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c + TiledArray/external/agnerfog/intel_mkl_feature_patch.c + ) + # N.B. --allow-multiple-definition is a GNU linker extension + list(APPEND _TILEDARRAY_DEPENDENCIES $ -Wl,--allow-multiple-definition) +endif() + # cache deps as TILEDARRAY_PRIVATE_LINK_LIBRARIES set(TILEDARRAY_PRIVATE_LINK_LIBRARIES ${_TILEDARRAY_DEPENDENCIES} CACHE STRING "List of libraries on which TiledArray depends on") diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 79f9f0932a..483847067f 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -113,6 +113,8 @@ #endif // !defined(TILEDARRAY_HAS_BTAS) #if defined(TILEDARRAY_HAS_BTAS) && defined(BTAS_HAS_INTEL_MKL) # define TILEDARRAY_HAS_INTEL_MKL +/* use fair dispatch in Intel MKL? */ +#cmakedefine IntelMKL_FAIR_DISPATCH #endif /* Add macro TILEDARRAY_FORCE_INLINE which does as the name implies. */ diff --git a/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c new file mode 100644 index 0000000000..f3706ef1fa --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c @@ -0,0 +1,48 @@ +/*********************** intel_cpu_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel compiler version 13.0 and later, including the general + * libraries, LIBM and SVML, but not MKL and VML. + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of generated code with non-Intel processors. + * In Windows: Use the static link libraries (*.lib), not the dynamic link + * librarise (*.DLL). + * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so). + * + * Include this code in your C or C++ program and call intel_cpu_patch(); + * before any call to the library functions. + * + * Copyright (c) 2014-2019. BSD License 2.0 + ******************************************************************************/ +#include + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// link to Intel libraries +extern int64_t __intel_cpu_feature_indicator; // CPU feature bits +extern int64_t __intel_cpu_feature_indicator_x; // CPU feature bits +void __intel_cpu_features_init(); // unfair dispatcher: checks CPU features for + // Intel CPU's only +void __intel_cpu_features_init_x(); // fair dispatcher: checks CPU features + // without discriminating by CPU brand + +#ifdef __cplusplus +} // end of extern "C" +#endif + +void intel_cpu_patch() { + // force a re-evaluation of the CPU features without discriminating by CPU + // brand + __intel_cpu_feature_indicator = 0; + __intel_cpu_feature_indicator_x = 0; + __intel_cpu_features_init_x(); + __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; +} diff --git a/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c new file mode 100644 index 0000000000..b88a1807f7 --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c @@ -0,0 +1,61 @@ +/*********************** intel_mkl_cpuid_patch.c ************************** + * Author: Agner Fog + * Date created: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except + * the Vector Math Library (VML). + * + * Example of how to override Intel's CPU feature dispatcher in order to improve + * compatibility of Intel function libraries with non-Intel processors. + * + * Include this code in your C or C++ program and make sure it is linked before + * any Intel libraries. You may need to include intel_mkl_feature_patch.c as + *well. + * + * Copyright (c) 2019. BSD License 2.0 + ******************************************************************************/ +#include + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// detect if Intel CPU +int mkl_serv_intel_cpu() { return 1; } + +// detect if Intel CPU +int mkl_serv_intel_cpu_true() { return 1; } + +int mkl_serv_cpuhaspnr_true() { return 1; } + +int mkl_serv_cpuhaspnr() { return 1; } + +int mkl_serv_cpuhasnhm() { return 1; } + +int mkl_serv_cpuisbulldozer() { return 0; } + +int mkl_serv_cpuiszen() { return 0; } + +int mkl_serv_cpuisatomsse4_2() { return 0; } + +int mkl_serv_cpuisatomssse3() { return 0; } + +int mkl_serv_cpuisitbarcelona() { return 0; } + +int mkl_serv_cpuisskl() { return 0; } + +int mkl_serv_cpuisknm() { return 0; } + +int mkl_serv_cpuisclx() { return 0; } + +int mkl_serv_get_microarchitecture() { + // I don't know what this number means + return 33; +} + +#ifdef __cplusplus +} // end of extern "C" +#endif diff --git a/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c new file mode 100644 index 0000000000..4844f2621d --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c @@ -0,0 +1,49 @@ +/*********************** intel_mkl_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except + * the Vector Math Library (VML). + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of Intel function libraries with non-Intel processors. + * In Windows: Use the static link libraries (*.lib), not the dynamic link + * librarise (*.DLL). + * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so). + * + * Include this code in your C or C++ program and call intel_mkl_patch(); + * before any call to the MKL functions. You may need to include + * intel_mkl_cpuid_patch.c as well. + * + * Copyright (c) 2014-2019. BSD License 2.0 + ******************************************************************************/ +#include + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// link to MKL libraries +extern int64_t __intel_mkl_feature_indicator; // CPU feature bits +extern int64_t __intel_mkl_feature_indicator_x; // CPU feature bits +void __intel_mkl_features_init(); // unfair dispatcher: checks CPU features for + // Intel CPU's only +void __intel_mkl_features_init_x(); // fair dispatcher: checks CPU features + // without discriminating by CPU brand + +#ifdef __cplusplus +} // end of extern "C" +#endif + +void intel_mkl_use_fair_dispatch() { + // force a re-evaluation of the CPU features without discriminating by CPU + // brand + __intel_mkl_feature_indicator = 0; + __intel_mkl_feature_indicator_x = 0; + __intel_mkl_features_init_x(); + __intel_mkl_feature_indicator = __intel_mkl_feature_indicator_x; +} diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt new file mode 100644 index 0000000000..0f891c9ed3 --- /dev/null +++ b/src/TiledArray/external/agnerfog/readme.txt @@ -0,0 +1,84 @@ + intel_dispatch_patch.zip + ======================== + +By Agner Fog, Technical University of Denmark, 2019. + +Intel's compilers are generating code that will run slower than necessary when +the code is executed on a CPU that is not produced by Intel. This has been +observed with Intel C, C++, and Fortran compilers. + +The same happens when certain function libraries produced by Intel are used, +even if the code is compiled with another compiler, such as Microsoft, Gnu +or Clang compilers. + +This problem is affecting several commonly used software programs such as +Matlab, because they are using Intel software libraries. + +The library code and the code generated by an Intel compiler may contain +multiple versions, each optimized for a particular instruction set extension. +A so-called CPU dispatcher is chosing the optimal version of the code at +runtime, based on which CPU it is running on. + +CPU dispatchers can be fair or unfair. A fair CPU dispatcher is chosing the +optimal code based only on which instruction set extensions are supported +by the CPU. An unfair dispatcher first checks the CPU brand. If the brand +is not Intel, then the unfair dispatcher will chose the "generic" version +of the code, i.e. the slowest version that is compatible with old CPUs +without the relevant instruction set extensions. + +The CPU dispatchers in many Intel function libraries have two versions, a +fair and an unfair one. It is not clear when the fair dispatcher is used +and when the unfair dispatcher is used. My observations about fair and +unfair CPU dispatching are as follows: + +* Code compiled with an Intel compiler will usually have unfair CPU dispatching. + +* The SVML (Short Vector Math Library) and IPP (Intel Performance Primitives) + function libraries from Intel are using the fair CPU dispatcher when used + with a non-Intel compiler. + +* The MKL (Math Kernel Library) library contains both fair and unfair + dispatchers. It is not clear which dispatcher is used on each function. + +The code examples contained herein may be used for circumventing unfair CPU +dispatching in order to improve compatibility with non-Intel CPUs. + +The following files are contained: + +intel_cpu_feature_patch.c +------------------------- +This code makes sure the fair dispatcher is called instead of the unfair +one for code generated with an Intel compiler and for general Intel +function libraries. + +intel_mkl_feature_patch.c +------------------------- +This does the same for the Intel MKL library. + +intel_mkl_cpuid_patch.c +----------------------- +This code example is overriding CPU detection functions in Intel's MKL +function library. The mkl_serv_intel_cpu() function in MKL is returning +1 when running on an Intel CPU and 0 when running on any other brand of +CPU. You may include this code to replace this function in MKL with a +function that returns 1 regardless of CPU brand. + +It may be necessary to use both intel_mkl_feature_patch.c and +intel_mkl_cpuid_patch.c when using the MKL library in software that +may run on any brand of CPU. + +An alternative method is to set the environment variable + MKL_DEBUG_CPU_TYPE=5 +when running on an AMD processor. This may be useful when you do not have +access to the source code, for example when running Matlab software. + +The patches provided here are based on undocumented features in Intel +function libraries. Use them at your own risk, and make sure to test your +code properly to make sure it works as intended. + +The most reliable solution is, of course, to avoid Intel compilers and +Intel function libraries in code that may run on other CPU brands such +as AMD and VIA. You may find other function libraries on the web, or +you may make your own functions. My vector class library (VCL) is useful +for making mathematical functions that process multiple data in parallel, +using the vector processing features of modern CPUs. diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 38bf61e86e..2a4b3d1199 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -16,6 +16,10 @@ #include #endif +#ifdef IntelMKL_FAIR_DISPATCH +extern "C" void intel_mkl_use_fair_dispatch(); +#endif + #include #include #include @@ -100,6 +104,9 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, TiledArray::set_default_world(default_world); #ifdef TILEDARRAY_HAS_DEVICE TiledArray::device_initialize(); +#endif +#ifdef IntelMKL_FAIR_DISPATCH + intel_mkl_use_fair_dispatch(); #endif TiledArray::max_threads = TiledArray::get_num_threads(); TiledArray::set_num_threads(1); From 1af2613553a2c8853253921d28c3d7e1b7994a9b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 11:53:40 -0400 Subject: [PATCH 17/20] introduced duration deque and statistics computation --- src/TiledArray/util/time.h | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/TiledArray/util/time.h b/src/TiledArray/util/time.h index aa0639bc0a..8ae649a6af 100644 --- a/src/TiledArray/util/time.h +++ b/src/TiledArray/util/time.h @@ -26,7 +26,10 @@ #ifndef TILEDARRAY_UTIL_TIME_H__INCLUDED #define TILEDARRAY_UTIL_TIME_H__INCLUDED +#include #include +#include +#include namespace TiledArray { @@ -46,6 +49,86 @@ inline int64_t duration_in_ns(time_point const &t0, time_point const &t1) { return std::chrono::duration_cast(t1 - t0).count(); } +namespace detail { +inline std::deque &call_durations_accessor() { + static std::deque call_durations; + return call_durations; +} +} // namespace detail + +/// Access recorded durations +inline const std::deque &durations() { + return detail::call_durations_accessor(); +} + +/// Clear recorded durations +inline void clear_durations() { detail::call_durations_accessor().clear(); } + +/// Record duration since the given time point +/// \param tp_start The start time point +inline void record_duration_since(const time_point &tp_start) { + detail::call_durations_accessor().push_back(duration_in_s(tp_start, now())); +} + +/// Record duration of a single function call +template +void record_duration(F &&f, Args &&...args) { + auto tp_start = now(); + std::forward(f)(std::forward(args)...); + record_duration_since(tp_start); +} + +/// Statistics of recorded durations +struct duration_stats_t { + double min = 0.0; + double max = 0.0; + double mean = 0.0; + double stddev = 0.0; + double median = 0.0; + double mean_reciprocal = 0.0; +}; + +/// Compute statistics of recorded durations +/// \return Statistics of recorded durations +inline duration_stats_t duration_statistics() { + duration_stats_t stats; + auto &durations = detail::call_durations_accessor(); + if (durations.empty()) return stats; + + stats.min = durations.front(); + stats.max = durations.front(); + stats.mean = durations.front(); + stats.mean_reciprocal = 1.0 / durations.front(); + double total = stats.mean; + double total_reciprocal = stats.mean_reciprocal; + for (size_t i = 1; i < durations.size(); ++i) { + total += durations[i]; + total_reciprocal += 1. / durations[i]; + stats.min = std::min(stats.min, durations[i]); + stats.max = std::max(stats.max, durations[i]); + } + stats.mean = total / durations.size(); + stats.mean_reciprocal = total_reciprocal / durations.size(); + + double sum_sq = 0.0; + for (size_t i = 0; i < durations.size(); ++i) { + sum_sq += (durations[i] - stats.mean) * (durations[i] - stats.mean); + } + stats.stddev = + durations.size() > 1 ? std::sqrt(sum_sq / (durations.size() - 1)) : 0.0; + + std::sort(durations.begin(), durations.end()); + stats.median = durations[durations.size() / 2]; + + return stats; +} + } // namespace TiledArray +#ifndef TA_RECORD_DURATION +/// Record duration of a statement +#define TA_RECORD_DURATION(statement) \ + TiledArray::record_duration([&] { statement; }); +#endif // !defined(TA_RECORD_DURATION) + #endif // TILEDARRAY_UTIL_TIME_H__INCLUDED From c49adefd5ef40a8a401cc25f997019d9b69c0445 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 11:55:24 -0400 Subject: [PATCH 18/20] examples/dgemm -> examples/gemm + misc cleanup - cleanup examples to use centralized timers/statistics facilities - remove dense_new_tile --- examples/CMakeLists.txt | 2 +- examples/dgemm/ta_dense_new_tile.cpp | 168 ------------------ examples/{dgemm => gemm}/CMakeLists.txt | 2 +- examples/{dgemm => gemm}/README | 4 +- .../block_size_data_process.py | 0 examples/{dgemm => gemm}/block_size_scan.sh | 0 examples/{dgemm => gemm}/ta_band.cpp | 32 ++-- examples/{dgemm => gemm}/ta_blas.cpp | 29 ++- examples/{dgemm => gemm}/ta_cc_abcd.cpp | 41 +++-- examples/{dgemm => gemm}/ta_dense.cpp | 29 ++- examples/{dgemm => gemm}/ta_dense_asymm.cpp | 28 ++- .../{dgemm => gemm}/ta_dense_nonuniform.cpp | 27 +-- examples/{dgemm => gemm}/ta_eigen.cpp | 23 +-- examples/{dgemm => gemm}/ta_sparse.cpp | 0 examples/{dgemm => gemm}/ta_sparse_grow.cpp | 0 15 files changed, 99 insertions(+), 286 deletions(-) delete mode 100644 examples/dgemm/ta_dense_new_tile.cpp rename examples/{dgemm => gemm}/CMakeLists.txt (94%) rename examples/{dgemm => gemm}/README (92%) rename examples/{dgemm => gemm}/block_size_data_process.py (100%) rename examples/{dgemm => gemm}/block_size_scan.sh (100%) rename examples/{dgemm => gemm}/ta_band.cpp (86%) rename examples/{dgemm => gemm}/ta_blas.cpp (80%) rename examples/{dgemm => gemm}/ta_cc_abcd.cpp (93%) rename examples/{dgemm => gemm}/ta_dense.cpp (89%) rename examples/{dgemm => gemm}/ta_dense_asymm.cpp (92%) rename examples/{dgemm => gemm}/ta_dense_nonuniform.cpp (87%) rename examples/{dgemm => gemm}/ta_eigen.cpp (76%) rename examples/{dgemm => gemm}/ta_sparse.cpp (100%) rename examples/{dgemm => gemm}/ta_sparse_grow.cpp (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 99edd4e33b..d240192893 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # Add Subdirectories add_subdirectory (cc) add_subdirectory (device) -add_subdirectory (dgemm) +add_subdirectory (gemm) add_subdirectory (demo) add_subdirectory (scalapack) add_subdirectory (fock) diff --git a/examples/dgemm/ta_dense_new_tile.cpp b/examples/dgemm/ta_dense_new_tile.cpp deleted file mode 100644 index 79dae8a579..0000000000 --- a/examples/dgemm/ta_dense_new_tile.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2013 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - */ - -#include -#include -#include - -using Tile_t = TiledArray::Tile>; -using Array_t = TiledArray::DistArray; - -void set_tiles(double val, Array_t& a) { - auto const& trange = a.trange(); - - auto pmap = a.pmap(); - const auto end = pmap->end(); - for (auto it = pmap->begin(); it != end; ++it) { - auto range = trange.make_tile_range(*it); - a.set(*it, Tile_t(TiledArray::Tensor(range, val))); - } -} - -int main(int argc, char** argv) { - int rc = 0; - - try { - // Initialize runtime - TiledArray::World& world = TA_SCOPED_INITIALIZE(argc, argv); - - // Get command line arguments - if (argc < 2) { - std::cout << "Usage: " << argv[0] - << " matrix_size block_size [repetitions]\n"; - return 0; - } - const long matrix_size = atol(argv[1]); - const long block_size = atol(argv[2]); - if (matrix_size <= 0) { - std::cerr << "Error: matrix size must be greater than zero.\n"; - return 1; - } - if (block_size <= 0) { - std::cerr << "Error: block size must be greater than zero.\n"; - return 1; - } - if ((matrix_size % block_size) != 0ul) { - std::cerr << "Error: matrix size must be evenly divisible by block " - "size.\n"; - return 1; - } - const long repeat = (argc >= 4 ? atol(argv[3]) : 5); - if (repeat <= 0) { - std::cerr << "Error: number of repetitions must be greater than zero.\n"; - return 1; - } - - const std::size_t num_blocks = matrix_size / block_size; - const std::size_t block_count = num_blocks * num_blocks; - - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test..." - << "\nGit description: " << TiledArray::git_description() - << "\nNumber of nodes = " << world.size() - << "\nMatrix size = " << matrix_size << "x" - << matrix_size << "\nBlock size = " << block_size - << "x" << block_size << "\nMemory per matrix = " - << double(matrix_size * matrix_size * sizeof(double)) / 1.0e9 - << " GB\nNumber of blocks = " << block_count - << "\nAverage blocks/node = " - << double(block_count) / double(world.size()) << "\n"; - - const double flop = - 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; - - // Construct TiledRange - std::vector blocking; - blocking.reserve(num_blocks + 1); - for (long i = 0l; i <= matrix_size; i += block_size) blocking.push_back(i); - - std::vector blocking2( - 2, TiledArray::TiledRange1(blocking.begin(), blocking.end())); - - TiledArray::TiledRange trange(blocking2.begin(), blocking2.end()); - - // Construct and initialize arrays - Array_t a(world, trange); - Array_t b(world, trange); - Array_t c(world, trange); - set_tiles(1.0, a); - set_tiles(1.0, b); - - TiledArray::TArrayD a_check(world, trange); - TiledArray::TArrayD b_check(world, trange); - TiledArray::TArrayD c_check(world, trange); - a_check.fill(1.0); - b_check.fill(1.0); - - // Start clock - world.gop.fence(); - if (world.rank() == 0) - std::cout << "Starting iterations: " - << "\n"; - - double total_time = 0.0; - - // Do matrix multiplication - for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - c_check("m,n") = a_check("m,k") * b_check("k,n"); - // world.gop.fence(); - const double time = madness::wall_time() - start; - total_time += time; - if (world.rank() == 0) - std::cout << "Iteration " << i + 1 << " time=" << time - << " GFLOPS=" << flop / time << "\n"; - auto check_it = c_check.begin(); - for (auto it = c.begin(); it != c.end() && check_it != c_check.end(); - ++it, ++check_it) { - auto tile_diff = it->get().tensor().subt(check_it->get()).norm(); - if (tile_diff >= 1e-15) { - std::cout << "Tile " << it.ordinal() << " failed test " - << " with norm diff " << tile_diff << std::endl; - assert(false); - } - } - } - - // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * flop / total_time << "\n"; - - } catch (TiledArray::Exception& e) { - std::cerr << "!! TiledArray exception: " << e.what() << "\n"; - rc = 1; - } catch (madness::MadnessException& e) { - std::cerr << "!! MADNESS exception: " << e.what() << "\n"; - rc = 1; - } catch (SafeMPI::Exception& e) { - std::cerr << "!! SafeMPI exception: " << e.what() << "\n"; - rc = 1; - } catch (std::exception& e) { - std::cerr << "!! std exception: " << e.what() << "\n"; - rc = 1; - } catch (...) { - std::cerr << "!! exception: unknown exception\n"; - rc = 1; - } - - return rc; -} diff --git a/examples/dgemm/CMakeLists.txt b/examples/gemm/CMakeLists.txt similarity index 94% rename from examples/dgemm/CMakeLists.txt rename to examples/gemm/CMakeLists.txt index 47df67bf36..5808cdec6e 100644 --- a/examples/dgemm/CMakeLists.txt +++ b/examples/gemm/CMakeLists.txt @@ -26,7 +26,7 @@ # Create example executable foreach(_exec ta_blas ta_eigen ta_band ta_dense ta_sparse ta_dense_nonuniform - ta_dense_asymm ta_sparse_grow ta_dense_new_tile + ta_dense_asymm ta_sparse_grow ta_cc_abcd) # Add executable diff --git a/examples/dgemm/README b/examples/gemm/README similarity index 92% rename from examples/dgemm/README rename to examples/gemm/README index bbb80e88c0..de156f154d 100644 --- a/examples/dgemm/README +++ b/examples/gemm/README @@ -12,9 +12,9 @@ Applications usage: ta_band matrix_size block_size band_width [repetitions] - blas matrix_size [repetitions] + ta_blas matrix_size [repetitions] - eigen matrix_size [repetitions] + ta_eigen matrix_size [repetitions] Argument definitions: diff --git a/examples/dgemm/block_size_data_process.py b/examples/gemm/block_size_data_process.py similarity index 100% rename from examples/dgemm/block_size_data_process.py rename to examples/gemm/block_size_data_process.py diff --git a/examples/dgemm/block_size_scan.sh b/examples/gemm/block_size_scan.sh similarity index 100% rename from examples/dgemm/block_size_scan.sh rename to examples/gemm/block_size_scan.sh diff --git a/examples/dgemm/ta_band.cpp b/examples/gemm/ta_band.cpp similarity index 86% rename from examples/dgemm/ta_band.cpp rename to examples/gemm/ta_band.cpp index d55550cebd..0743ef734b 100644 --- a/examples/dgemm/ta_band.cpp +++ b/examples/gemm/ta_band.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include @@ -104,38 +105,33 @@ int main(int argc, char** argv) { for (; j < j_end; ++j, ++ij) shape_tensor[ij] = 1.0; } - TiledArray::SparseShape shape(shape_tensor, trange); + TiledArray::SparseShape shape( + shape_tensor, trange, /* per_element_norms_already = */ true); // Construct and initialize arrays TiledArray::TSpArrayD a(world, trange, shape); TiledArray::TSpArrayD b(world, trange, shape); - TiledArray::TSpArrayD c(world, trange); + TiledArray::TSpArrayD c; a.fill(1.0); b.fill(1.0); - // Start clock - world.gop.fence(); - const double wall_time_start = madness::wall_time(); - // Do matrix multiplication + world.gop.fence(); for (int i = 0; i < repeat; ++i) { - c("m,n") = a("m,k") * b("k,n"); - world.gop.fence(); + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();) if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n"; } - // Stop clock - const double wall_time_stop = madness::wall_time(); - // Print results - const long flop = 2.0 * c("m,n").sum().get(); + const auto gflops_per_call = 2.0 * c("m,n").sum().get() / 1.e9; if (world.rank() == 0) { - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * double(flop) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; } } catch (TiledArray::Exception& e) { diff --git a/examples/dgemm/ta_blas.cpp b/examples/gemm/ta_blas.cpp similarity index 80% rename from examples/dgemm/ta_blas.cpp rename to examples/gemm/ta_blas.cpp index aeefaae908..c97f5bbedc 100644 --- a/examples/dgemm/ta_blas.cpp +++ b/examples/gemm/ta_blas.cpp @@ -17,13 +17,14 @@ * */ +#include #include #include int main(int argc, char** argv) { // Get command line arguments if (argc < 2) { - std::cout << "Usage: " << argv[0] << " matrix_size [repetitions]\n"; + std::cout << "Usage: " << argv[0] << " matrix_size [repetitions = 5]\n"; return 0; } const long matrix_size = atol(argv[1]); @@ -66,31 +67,25 @@ int main(int argc, char** argv) { const integer m = matrix_size, n = matrix_size, k = matrix_size; const integer lda = matrix_size, ldb = matrix_size, ldc = matrix_size; - // Start clock - const double wall_time_start = madness::wall_time(); - // Do matrix multiplication - // Note: If TiledArray has not been configured with blas, this will be an - // eigen call. for (int i = 0; i < repeat; ++i) { - gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + TA_RECORD_DURATION( + gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)); } - - // Stop clock - const double wall_time_stop = madness::wall_time(); + auto durations = TiledArray::duration_statistics(); // Cleanup memory free(a); free(b); free(c); - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * 2.0 * - double(matrix_size * matrix_size * matrix_size) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + const auto gflops_per_call = + 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; + std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << "\nMedian GFLOPS = " << gflops_per_call / durations.median + << std::endl; return 0; } diff --git a/examples/dgemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp similarity index 93% rename from examples/dgemm/ta_cc_abcd.cpp rename to examples/gemm/ta_cc_abcd.cpp index c1881063d4..df05e78c4a 100644 --- a/examples/dgemm/ta_cc_abcd.cpp +++ b/examples/gemm/ta_cc_abcd.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include #include @@ -35,13 +36,13 @@ bool to_bool(const char* str) { // the last tile absorbs the remainder std::vector make_tiling(unsigned int range_size, unsigned int ntiles) { - const auto average_tile_size = range_size / ntiles; - TA_ASSERT(average_tile_size > ntiles); + const int average_tile_size = range_size / ntiles; std::vector result(ntiles + 1); result[0] = 0; for (long t = 0; t != ntiles - 1; ++t) { - result[t + 1] = - result[t] + average_tile_size + ((t % 2 == 0) ? (t + 1) : (-t)); + result[t + 1] = result[t] + average_tile_size + + std::max(static_cast((t % 2 == 0) ? (t + 1) : (-t)), + 1 - average_tile_size); } result[ntiles] = range_size; return result; @@ -174,8 +175,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, const double flops_per_fma = (complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real - const double n_gflop = flops_per_fma * std::pow(n_occ, 2) * - std::pow(n_uocc, 4) / std::pow(1024., 3); + const double gflops_per_call = flops_per_fma * std::pow(n_occ, 2) * + std::pow(n_uocc, 4) / std::pow(1024., 3); // Construct tensors TA::TArrayD t2(world, trange_oovv); @@ -196,13 +197,9 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - + auto tp_start = TiledArray::now(); // this is how the user would express this contraction if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d"); @@ -223,23 +220,25 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, << error("i,j,a,b").squared_norm().get() << std::endl; } } + TiledArray::record_duration_since(tp_start); - const double stop = madness::wall_time(); - const double time = stop - start; - total_time += time; - const double gflop_rate = n_gflop / time; - total_gflop_rate += gflop_rate; + const double time = TiledArray::durations().back(); + const double gflop_rate = gflops_per_call / time; if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; } // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " - << total_time / static_cast(repeat) - << " sec\nAverage GFLOPS = " - << total_gflop_rate / static_cast(repeat) << "\n"; + if (world.rank() == 0) { + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; + } } template diff --git a/examples/dgemm/ta_dense.cpp b/examples/gemm/ta_dense.cpp similarity index 89% rename from examples/dgemm/ta_dense.cpp rename to examples/gemm/ta_dense.cpp index 82506b1d0d..c0ffebd4dc 100644 --- a/examples/dgemm/ta_dense.cpp +++ b/examples/gemm/ta_dense.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include #include @@ -129,7 +130,7 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange, const auto n = trange.elements_range().extent()[0]; const auto complex_T = TiledArray::detail::is_complex::value; - const double gflop = + const double gflops_per_call = (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real * double(n * n * n) / 1.0e9; @@ -168,28 +169,26 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange, std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - memtrace("c=a*b"); - const double time = madness::wall_time() - start; - total_time += time; - const double gflop_rate = gflop / time; - total_gflop_rate += gflop_rate; + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");) + const auto time = TiledArray::durations().back(); + const double gflop_rate = gflops_per_call / time; if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; } // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << total_gflop_rate / double(repeat) << "\n"; + if (world.rank() == 0) { + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; + } } // array lifetime scope memtrace("stop"); diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp similarity index 92% rename from examples/dgemm/ta_dense_asymm.cpp rename to examples/gemm/ta_dense_asymm.cpp index 40183603bb..acef959c7a 100644 --- a/examples/dgemm/ta_dense_asymm.cpp +++ b/examples/gemm/ta_dense_asymm.cpp @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -125,11 +126,11 @@ int main(int argc, char** argv) { using Array = std::decay_t>; using T = TiledArray::detail::numeric_t; const auto complex_T = TiledArray::detail::is_complex_v; - const std::int64_t nflops = + const double gflops_per_call = (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real * static_cast(Nn) * static_cast(Nm) * - static_cast(Nk); + static_cast(Nk) / 1.e9; if (world.rank() == 0) std::cout << "TiledArray: dense matrix multiply test...\n" @@ -182,18 +183,11 @@ int main(int argc, char** argv) { std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - memtrace("c=a*b"); - const double time = madness::wall_time() - start; - total_time += time; - const double gflop_rate = double(nflops) / (time * 1.e9); - total_gflop_rate += gflop_rate; + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");) + const double time = TiledArray::durations().back(); + const double gflop_rate = gflops_per_call / time; if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; @@ -203,9 +197,13 @@ int main(int argc, char** argv) { const double wall_time_stop = madness::wall_time(); if (world.rank() == 0) { - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << total_gflop_rate / double(repeat) << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; } } // array lifetime scope diff --git a/examples/dgemm/ta_dense_nonuniform.cpp b/examples/gemm/ta_dense_nonuniform.cpp similarity index 87% rename from examples/dgemm/ta_dense_nonuniform.cpp rename to examples/gemm/ta_dense_nonuniform.cpp index c01a4ece11..20e8cce712 100644 --- a/examples/dgemm/ta_dense_nonuniform.cpp +++ b/examples/gemm/ta_dense_nonuniform.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include #include @@ -58,7 +59,7 @@ int main(int argc, char** argv) { const long num_blocks = matrix_size / block_size; const long block_count = num_blocks * num_blocks; - const double flop = + const double gflops_per_call = 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; // Construct TiledRange @@ -121,25 +122,25 @@ int main(int argc, char** argv) { std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - // world.gop.fence(); - const double time = madness::wall_time() - start; - total_time += time; + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();) + const double time = TiledArray::durations().back(); if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time - << " GFLOPS=" << flop / time << "\n"; + << " GFLOPS=" << gflops_per_call / time << "\n"; } // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * flop / total_time << "\n"; + if (world.rank() == 0) { + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; + } } catch (TiledArray::Exception& e) { std::cerr << "!! TiledArray exception: " << e.what() << "\n"; diff --git a/examples/dgemm/ta_eigen.cpp b/examples/gemm/ta_eigen.cpp similarity index 76% rename from examples/dgemm/ta_eigen.cpp rename to examples/gemm/ta_eigen.cpp index 0aa5474cd6..018de9a81f 100644 --- a/examples/dgemm/ta_eigen.cpp +++ b/examples/gemm/ta_eigen.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include @@ -50,24 +51,16 @@ int main(int argc, char** argv) { b.fill(1.0); c.fill(0.0); - // Start clock - const double wall_time_start = madness::wall_time(); - - // Do matrix multiplcation + // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - c.noalias() = 1.0 * a * b + 0.0 * c; + TA_RECORD_DURATION(c.noalias() = 1.0 * a * b + 0.0 * c); } - // Stop clock - const double wall_time_stop = madness::wall_time(); - - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * 2.0 * - double(matrix_size * matrix_size * matrix_size) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = " + << (2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9) * + durations.mean_reciprocal + << std::endl; return 0; } diff --git a/examples/dgemm/ta_sparse.cpp b/examples/gemm/ta_sparse.cpp similarity index 100% rename from examples/dgemm/ta_sparse.cpp rename to examples/gemm/ta_sparse.cpp diff --git a/examples/dgemm/ta_sparse_grow.cpp b/examples/gemm/ta_sparse_grow.cpp similarity index 100% rename from examples/dgemm/ta_sparse_grow.cpp rename to examples/gemm/ta_sparse_grow.cpp From 1ce2c33dcaa37cd688205aab9679515d0b3e3f6f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 17:36:17 -0400 Subject: [PATCH 19/20] fetching eigen3 via ExternalProject_Add's needs to specify STAMP_DIR and TMP_DIR just like librett and umpire --- external/eigen.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/external/eigen.cmake b/external/eigen.cmake index f2d28076dd..1489b53fe6 100644 --- a/external/eigen.cmake +++ b/external/eigen.cmake @@ -104,7 +104,9 @@ else() ExternalProject_Add(eigen3 PREFIX ${CMAKE_INSTALL_PREFIX} - #--Download step-------------- + STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts + TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable + #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} URL ${EIGEN3_URL} URL_HASH ${EIGEN3_URL_HASH} From 5e22878f2775f711286fcb03b4ddfaf46f089100 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Apr 2024 08:30:25 -0400 Subject: [PATCH 20/20] to avoid premature creation of install directory ExternalProject_add will use FETCHCONTENT_BASE_DIR as PREFIX --- external/eigen.cmake | 2 +- external/librett.cmake | 2 +- external/umpire.cmake | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/external/eigen.cmake b/external/eigen.cmake index 1489b53fe6..57bbead90d 100644 --- a/external/eigen.cmake +++ b/external/eigen.cmake @@ -103,7 +103,7 @@ else() message("** Will build Eigen from ${EIGEN3_URL}") ExternalProject_Add(eigen3 - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- diff --git a/external/librett.cmake b/external/librett.cmake index c04cf56b38..afebabb486 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -109,7 +109,7 @@ else() message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}") ExternalProject_Add(librett - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- diff --git a/external/umpire.cmake b/external/umpire.cmake index c7a02d65bf..57675ca189 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -163,7 +163,7 @@ else() message(STATUS "custom target Umpire is expected to build these byproducts: ${UMPIRE_BUILD_BYPRODUCTS}") ExternalProject_Add(Umpire - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step--------------