From d31ad7b1801d01da498754ac28bffe1cd30b863b Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 10 Nov 2023 15:27:45 -0500 Subject: [PATCH 01/19] Add full support for nonuniform tile sizes to gesv tester --- test/matrix_utils.hh | 132 ++++++++++++++++++++++++++++++++++++++++ test/test.cc | 2 +- test/test_gesv.cc | 142 ++++++++++--------------------------------- 3 files changed, 165 insertions(+), 111 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index e9fd18412..1ff2df78d 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -203,4 +203,136 @@ matrix_type matrix_cast( return matrix_cast_impl( MatrixType< matrix_type >(), A, uplo, diag ); } + +// ----------------------------------------------------------------------------- +// Functions for allocating test matrices + +template +class TestMatrix { + using scalar_t = typename MatrixType::value_type; + +public: + // SLATE matrices + MatrixType A; + MatrixType Aref; + + // Storage for ScaLAPACK matrices + std::vector A_data; + std::vector Aref_data; + + // ScaLAPACK configuration + int64_t mloc, nloc, lld, nb; +}; + +// ----------------------------------------------------------------------------- +/// Marks the paramters used by allocate_test_Matrix +inline void mark_params_for_test_Matrix(Params& params) +{ + params.grid.m(); + params.grid.n(); + params.nb(); + params.nonuniform_nb(); + params.origin(); + params.grid_order(); +} + +// ----------------------------------------------------------------------------- +/// Allocates a Matrix and a reference version for testing. +/// +/// @param run[in] +/// Whether to actaully allocate the matrix, instead of just marking params +/// +/// @param nonuniform_ref[in] +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + int64_t m, + int64_t n, + Params& params) +{ + // Load params variables + int64_t p = params.grid.m(); + int64_t q = params.grid.n(); + int64_t nb = params.nb(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + slate::Origin origin = params.origin(); + slate::GridOrder grid_order = params.grid_order(); + + // The object to be returned + TestMatrix> matrix; + + // ScaLAPACK variables + int mpi_rank, myrow, mycol; + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); + matrix.nb = nb; + matrix.mloc = num_local_rows_cols( m, nb, myrow, p ); + matrix.nloc = num_local_rows_cols (n, nb, mycol, q ); + matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A + + // Functions for nonuniform tile sizes + std::function< int64_t (int64_t j) > + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? nb*2 : nb); + }; + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); + int num_devices_ = blas::get_device_count(); + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, + p, num_devices_ ); + + // Setup matrix to test SLATE with + if (origin != slate::Origin::ScaLAPACK) { + // SLATE allocates CPU or GPU tiles. 
+ slate::Target origin_target = origin2target( origin ); + if (nonuniform_nb) { + + matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD); + } + else { + matrix.A = slate::Matrix( m, n, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + matrix.A.insertLocalTiles( origin_target ); + } + else { + assert( !nonuniform_nb ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::Matrix::fromScaLAPACK( + m, n, &matrix.A_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb) { + matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD ); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::Matrix::fromScaLAPACK( + m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + } + + return matrix; +} + #endif // SLATE_MATRIX_UTILS_HH diff --git a/test/test.cc b/test/test.cc index 2d46f782f..ebf397f78 100644 --- a/test/test.cc +++ b/test/test.cc @@ -404,7 +404,7 @@ Params::Params(): 0, 1000000, "(pt) max number of threads used in panel; default omp_num_threads / 2"), align ("align", 5, ParamType::List, 32, 1, 1024, "column alignment (sets lda, ldb, etc. to multiple of align)"), nonuniform_nb("nonuniform_nb", - 0, ParamType::Value, 'n', "ny", "generate matrix with nonuniform tile sizes"), + 0, ParamType::List, 'n', "ny", "generate matrix with nonuniform tile sizes"), debug ("debug", 0, ParamType::Value, -1, 0, 1000000, "given rank waits for debugger (gdb/lldb) to attach"), pivot_threshold( diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 5a21e955d..d25240d01 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -9,6 +9,7 @@ #include "lapack/flops.hh" #include "print_matrix.hh" #include "grid_utils.hh" +#include "matrix_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -60,7 +61,6 @@ void test_gesv_work(Params& params, bool run) int64_t nrhs = params.nrhs(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); - int64_t nb = params.nb(); int64_t ib = params.ib(); int64_t lookahead = params.lookahead(); int64_t panel_threads = params.panel_threads(); @@ -78,6 +78,8 @@ void test_gesv_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_Matrix( params ); + // Currently only gesv* supports timer_level >= 2. std::vector timer_lvl_support{ "gesv", "gesv_mixed", "gesv_mixed_gmres"}; @@ -154,20 +156,9 @@ void test_gesv_work(Params& params, bool run) if (! run) return; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - - if (nonuniform_nb) { - if (ref || origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; - return; - } - params.ref() = 'n'; - params.origin() = slate::Origin::Host; - ref = false; - origin = slate::Origin::Host; + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + return; } if ((params.routine == "gesv_mixed" || params.routine == "gesv_mixed_gmres") @@ -192,78 +183,23 @@ void test_gesv_work(Params& params, bool run) int64_t info = 0; - // Matrix A: figure out local size. 
- int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(nrhs, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // To generate matrix with non-uniform tile size using the lambda constructor - std::function< int64_t (int64_t j) > - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb/2 : nb); - }; - - // rank and device functions for using the lambda constructor - auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, - p, num_devices_ ); - - std::vector A_data, B_data, X_data; - slate::Matrix A, B, X; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - if (nonuniform_nb) { - A = slate::Matrix(m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - B = slate::Matrix(n, nrhs, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - } - else { - A = slate::Matrix( - m, n, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - B = slate::Matrix( - n, nrhs, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } - A.insertLocalTiles(origin_target); - B.insertLocalTiles(origin_target); - - if (is_iterative) { - X_data.resize(lldB*nlocB); - if (nonuniform_nb) { - X = slate::Matrix(n, nrhs, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - } - else { - X = slate::Matrix( - n, nrhs, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } - X.insertLocalTiles(origin_target); - } + auto A_allocation = allocate_test_Matrix( check || ref, + m, n, params ); + auto B_allocation = allocate_test_Matrix( check || ref, + n, nrhs, params ); + TestMatrix> X_allocation; + if (is_iterative) { + X_allocation = allocate_test_Matrix( false, + n, nrhs, params ); } - else { - // Create SLATE matrix from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - - B_data.resize( lldB * nlocB ); - B = slate::Matrix::fromScaLAPACK( - n, nrhs, &B_data[0], lldB, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - if (is_iterative) { - X_data.resize(lldB*nlocB); - X = slate::Matrix::fromScaLAPACK( - n, nrhs, &X_data[0], lldB, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } - } + auto& A = A_allocation.A; + auto& Aref = A_allocation.Aref; + auto& Aref_data = A_allocation.Aref_data; + auto& B = B_allocation.A; + auto& Bref = B_allocation.Aref; + auto& Bref_data = B_allocation.Aref_data; + auto& X = X_allocation.A; slate::Pivots pivots; @@ -272,28 +208,7 @@ void test_gesv_work(Params& params, bool run) slate::generate_matrix(params.matrixB, B, matgen_opts); // If check/ref is required, copy test data. 
- slate::Matrix Aref, Bref; - std::vector Aref_data, Bref_data; if (check || ref) { - if (nonuniform_nb) { - Aref = slate::Matrix(m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - Bref = slate::Matrix(n, nrhs, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - Aref.insertLocalTiles( slate::Target::Host ); - Bref.insertLocalTiles( slate::Target::Host ); - } - else { - Aref_data.resize( lldA* nlocA ); - Bref_data.resize( lldB* nlocB ); - Aref = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - Bref = slate::Matrix::fromScaLAPACK( - n, nrhs, &Bref_data[0], lldB, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - slate::copy(A, Aref); slate::copy(B, Bref); } @@ -474,10 +389,15 @@ void test_gesv_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only if (nonuniform_nb) { - printf("Unsupported to test nonuniform tile size using scalapack\n"); + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; return; } + // MPI variables + int mpi_rank, myrow, mycol; + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); + // BLACS/MPI variables blas_int ictxt, myrow_, mycol_, p_, q_; blas_int mpi_rank_ = 0, nprocs = 1; @@ -492,17 +412,19 @@ void test_gesv_work(Params& params, bool run) slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); + int nb = A_allocation.nb; + // ScaLAPACK descriptor for the reference matrix blas_int Aref_desc[9]; - scalapack_descinit(Aref_desc, m, n, nb, nb, 0, 0, ictxt, mlocA, &info); + scalapack_descinit(Aref_desc, m, n, nb, nb, 0, 0, ictxt, A_allocation.mloc, &info); slate_assert(info == 0); blas_int Bref_desc[9]; - scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info); + scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, B_allocation.mloc, &info); slate_assert(info == 0); // ScaLAPACK data for pivots. - std::vector ipiv_ref(lldA + nb); + std::vector ipiv_ref(A_allocation.lld + nb); if (params.routine == "getrs") { // Factor matrix A. 
From 9d641bda562479a605cbc316471f2942e83945ee Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 13 Nov 2023 11:30:57 -0500 Subject: [PATCH 02/19] Improve robustness of BaseMatrix --- include/slate/BaseMatrix.hh | 34 ++++++++++++++++++++++--- include/slate/internal/MatrixStorage.hh | 9 ++++--- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index f4c666184..d263a3bd7 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -904,8 +904,32 @@ BaseMatrix::BaseMatrix( ++nt_; } + // Pass variables into lambda w/out full this object + int64_t mt_1 = mt_-1; + int64_t last_mb = last_mb_; + int64_t nt_1 = nt_-1; + int64_t last_nb = last_nb_; + std::function + realTileMb = [inTileMb, last_mb, mt_1](int64_t i) { + if (i == mt_1) { + return last_mb; + } + else { + return inTileMb( i ); + } + }; + std::function + realTileNb = [inTileNb, last_nb, nt_1](int64_t j) { + if (j == nt_1) { + return last_nb; + } + else { + return inTileNb( j ); + } + }; + storage_ = std::make_shared< MatrixStorage< scalar_t > >( - mt_, nt_, inTileMb, inTileNb, + mt_, nt_, realTileMb, realTileNb, inTileRank, inTileDevice, mpi_comm ); slate_mpi_call( @@ -2686,6 +2710,11 @@ void BaseMatrix::tileGet(int64_t i, int64_t j, int dst_device, src_tile->layout() : Layout(layout); } + else { + target_layout = layout == LayoutConvert::None ? + tile_node[dst_device]->layout() : + Layout(layout); + } if (! tile_node.existsOn(dst_device)) { // Create a copy on the destination. @@ -2700,8 +2729,7 @@ void BaseMatrix::tileGet(int64_t i, int64_t j, int dst_device, tileCopyDataLayout( src_tile, dst_tile, target_layout, async ); dst_tile->state(MOSI::Shared); - if (src_tile->stateOn(MOSI::Modified)) - src_tile->state(MOSI::Shared); + src_tile->state(MOSI::Shared); // src was either shared or modified } if (modify) { tileModified(i, j, dst_device); diff --git a/include/slate/internal/MatrixStorage.hh b/include/slate/internal/MatrixStorage.hh index 3257f8004..b2d8fb98c 100644 --- a/include/slate/internal/MatrixStorage.hh +++ b/include/slate/internal/MatrixStorage.hh @@ -107,10 +107,11 @@ public: void eraseOn(int device) { slate_assert(device >= -1 && device+1 < int(tiles_.size())); - if (tiles_[device+1] != nullptr) { - tiles_[device+1]->state(MOSI::Invalid); - delete tiles_[device+1]; - tiles_[device+1] = nullptr; + auto tile = tiles_[device+1]; + tiles_[device+1] = nullptr; + if (tile != nullptr) { + assert(!tile->stateOn(MOSI::OnHold)); + delete tile; --num_instances_; } } From 63bd0ed37d923df0375b20df6c8ceddf12019008 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 13 Nov 2023 13:28:24 -0500 Subject: [PATCH 03/19] Add nonuniform block size to GEMM tester --- test/matrix_utils.hh | 14 +++- test/test_gemm.cc | 160 +++++++++++++++++-------------------------- test/test_gesv.cc | 38 +++++----- 3 files changed, 93 insertions(+), 119 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 1ff2df78d..aab4d9469 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -8,6 +8,8 @@ #include "slate/slate.hh" +#include "scalapack_wrappers.hh" + //------------------------------------------------------------------------------ // Zero out B, then copy band matrix B from A. 
// B is stored as a non-symmetric matrix, so we can apply Q from left @@ -221,7 +223,15 @@ public: std::vector Aref_data; // ScaLAPACK configuration - int64_t mloc, nloc, lld, nb; + int64_t m, n, mloc, nloc, lld, nb; + + #ifdef SLATE_HAVE_SCALAPACK + void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) { + int64_t info; + scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mloc, &info); + slate_assert(info == 0); + } + #endif }; // ----------------------------------------------------------------------------- @@ -272,6 +282,8 @@ TestMatrix> allocate_test_Matrix( // The object to be returned TestMatrix> matrix; + matrix.m = m; + matrix.n = n; // ScaLAPACK variables int mpi_rank, myrow, mycol; diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 8af202539..f9f1c18fe 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -8,6 +8,7 @@ #include "blas/flops.hh" #include "print_matrix.hh" #include "grid_utils.hh" +#include "matrix_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -43,7 +44,6 @@ void test_gemm_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); int64_t k = params.dim.k(); - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); int64_t lookahead = params.lookahead(); @@ -52,15 +52,18 @@ void test_gemm_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); - slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::Origin origin = params.origin(); slate::GridOrder grid_order = params.grid_order(); slate::Method method_gemm = params.method_gemm(); params.matrix.mark(); params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -74,6 +77,15 @@ void test_gemm_work(Params& params, bool run) if (! run) return; + #ifndef SLATE_HAVE_SCALAPACK + // Can only run ref when we have ScaLAPACK + if (ref) { + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + ref = false; + } + #endif + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -91,77 +103,23 @@ void test_gemm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(n, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - // todo: C_data only if origin == ScaLAPACK? 
- C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, Cm, Cn, params ); - slate::Matrix A, B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix( - Am, An, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - A.insertLocalTiles( origin_target ); - - B = slate::Matrix( - Bm, Bn, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - B.insertLocalTiles( origin_target ); - - C = slate::Matrix( - Cm, Cn, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - C.insertLocalTiles( origin_target ); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix(params.matrix, A); slate::generate_matrix(params.matrixB, B); slate::generate_matrix(params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - std::vector Cref_data; - slate::Matrix Cref; - if (ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - m, n, &Cref_data[0], lldC, nb, nb, grid_order, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + // if reference run is required, copy test data. + if (ref) { + slate::copy( C, Cref ); + } if (transA == slate::Op::Trans) A = transpose(A); @@ -177,29 +135,29 @@ void test_gemm_work(Params& params, bool run) slate_assert(B.nt() == C.nt()); slate_assert(A.nt() == B.mt()); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, record norms to be used in the check/ref. - real_t A_norm=0, B_norm=0, C_orig_norm=0; - if (ref) { - A_norm = slate::norm(norm, A); - B_norm = slate::norm(norm, B); - C_orig_norm = slate::norm(norm, Cref); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { // Compute Y = alpha A * (B * X) + (beta C * X). - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( k, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, k, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Z = B * X; slate::multiply( one, B, X, zero, Z, opts ); @@ -246,6 +204,9 @@ void test_gemm_work(Params& params, bool run) } if (check && ! 
ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -263,36 +224,46 @@ void test_gemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // BLACS/MPI variables + int mpi_rank, myrow, mycol; blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); Cblacs_pinfo(&mpi_rank_, &nprocs); slate_assert( mpi_rank == mpi_rank_ ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. 
copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); @@ -338,9 +309,6 @@ void test_gemm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); #endif } } diff --git a/test/test_gesv.cc b/test/test_gesv.cc index d25240d01..19408010f 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -183,23 +183,20 @@ void test_gesv_work(Params& params, bool run) int64_t info = 0; - auto A_allocation = allocate_test_Matrix( check || ref, - m, n, params ); - auto B_allocation = allocate_test_Matrix( check || ref, - n, nrhs, params ); - TestMatrix> X_allocation; + auto A_alloc = allocate_test_Matrix( check || ref, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, n, nrhs, params ); + TestMatrix> X_alloc; if (is_iterative) { - X_allocation = allocate_test_Matrix( false, - n, nrhs, params ); + X_alloc = allocate_test_Matrix( false, n, nrhs, params ); } - auto& A = A_allocation.A; - auto& Aref = A_allocation.Aref; - auto& Aref_data = A_allocation.Aref_data; - auto& B = B_allocation.A; - auto& Bref = B_allocation.Aref; - auto& Bref_data = B_allocation.Aref_data; - auto& X = X_allocation.A; + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; + auto& Bref_data = B_alloc.Aref_data; + auto& X = X_alloc.A; slate::Pivots pivots; @@ -412,19 +409,16 @@ void test_gesv_work(Params& params, bool run) slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int nb = A_allocation.nb; + int nb = A_alloc.nb; // ScaLAPACK descriptor for the reference matrix - blas_int Aref_desc[9]; - scalapack_descinit(Aref_desc, m, n, nb, nb, 0, 0, ictxt, A_allocation.mloc, &info); - slate_assert(info == 0); + blas_int Aref_desc[9], Bref_desc[9]; - blas_int Bref_desc[9]; - scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, B_allocation.mloc, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); // ScaLAPACK data for pivots. - std::vector ipiv_ref(A_allocation.lld + nb); + std::vector ipiv_ref(A_alloc.lld + nb); if (params.routine == "getrs") { // Factor matrix A. From 87e1558252da993d25695817ad3bca7487ec75e5 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 08:50:59 -0500 Subject: [PATCH 04/19] Add nonuniform tests to set --- test/matrix_utils.hh | 8 +-- test/test_add.cc | 132 ------------------------------------------- test/test_gemm.cc | 12 ++-- test/test_gesv.cc | 6 +- test/test_set.cc | 71 +++++++---------------- test/test_utils.hh | 111 ++++++++++++++++++++++++++++++++++++ 6 files changed, 145 insertions(+), 195 deletions(-) create mode 100644 test/test_utils.hh diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index aab4d9469..fbc3c2a22 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -249,8 +249,8 @@ inline void mark_params_for_test_Matrix(Params& params) // ----------------------------------------------------------------------------- /// Allocates a Matrix and a reference version for testing. 
/// -/// @param run[in] -/// Whether to actaully allocate the matrix, instead of just marking params +/// @param ref_matrix[in] +/// Whether to allocate a reference matrix /// /// @param nonuniform_ref[in] /// If params.nonuniform_nb(), whether to also allocate the reference matrix @@ -268,6 +268,7 @@ inline void mark_params_for_test_Matrix(Params& params) template TestMatrix> allocate_test_Matrix( bool ref_matrix, + bool nonuniform_ref, int64_t m, int64_t n, Params& params) @@ -310,7 +311,6 @@ TestMatrix> allocate_test_Matrix( // SLATE allocates CPU or GPU tiles. slate::Target origin_target = origin2target( origin ); if (nonuniform_nb) { - matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); } @@ -331,7 +331,7 @@ TestMatrix> allocate_test_Matrix( // Setup reference matrix if (ref_matrix) { - if (nonuniform_nb) { + if (nonuniform_nb && nonuniform_ref) { matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); matrix.Aref.insertLocalTiles( slate::Target::Host ); diff --git a/test/test_add.cc b/test/test_add.cc index 90a8ab6a1..9f8703500 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -18,138 +18,6 @@ #include #include -// ----------------------------------------------------------------------------- -// subtract_matrix takes input matrices A and B, -// assumed to be of the same dimension, and performs the operation B = A - B. -// This was developed for checking slate::add without using slate::add to check. -// It is a CPU-only implementation and assumes column-major. - -template -void subtract_matrix( matrix_type& A, matrix_type& B ) -{ - using scalar_t = typename matrix_type::value_type; - int64_t mt = A.mt(); - int64_t nt = A.nt(); - if constexpr (std::is_same>::value) { - #pragma omp parallel for collapse(2) - for (int64_t j = 0; j < nt; ++j) { - for (int64_t i = 0; i < mt; ++i) { - if (A.tileIsLocal( i, j )) { - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - } - } - } - else if (A.uploPhysical() == slate::Uplo::Upper) { - // todo: collapse(2) here requires OpenMP 5 - #pragma omp parallel for - for (int64_t i = 0; i < mt; ++i) { - for (int64_t j = i; j < nt; ++j) { - if (A.tileIsLocal( i, j )) { - auto Aij = A(i, j); - if (Aij.uploPhysical() == slate::Uplo::Upper) { - // Diagonal tiles are upper. - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii <= jj && ii < mb; ++ii) { // upper - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - else { - // Off-diagonal tiles are full. 
- A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - } - } - } - } - else if (A.uploPhysical() == slate::Uplo::Lower) { - // todo: collapse(2) here requires OpenMP 5 - #pragma omp parallel for - for (int64_t j = 0; j < nt; ++j) { - for (int64_t i = j; i < mt; ++i) { - if (A.tileIsLocal( i, j )) { - auto Aij = A(i, j); - if (Aij.uploPhysical() == slate::Uplo::Lower) { - // Diagonal tiles are lower. - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = jj; ii < mb; ++ii) { // lower - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - else { - // Off-diagonal tiles are full. - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - } - } - } - } -} - //------------------------------------------------------------------------------ template void test_add_work(Params& params, bool run) diff --git a/test/test_gemm.cc b/test/test_gemm.cc index f9f1c18fe..659186610 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -103,9 +103,9 @@ void test_gemm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - auto A_alloc = allocate_test_Matrix( false, Am, An, params ); - auto B_alloc = allocate_test_Matrix( false, Bm, Bn, params ); - auto C_alloc = allocate_test_Matrix( ref, Cm, Cn, params ); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); auto& A = A_alloc.A; auto& B = B_alloc.A; @@ -147,9 +147,9 @@ void test_gemm_work(Params& params, bool run) TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { // Compute Y = alpha A * (B * X) + (beta C * X). 
- X_alloc = allocate_test_Matrix( false, n, nrhs, params ); - Y_alloc = allocate_test_Matrix( false, m, nrhs, params ); - Z_alloc = allocate_test_Matrix( false, k, nrhs, params ); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, k, nrhs, params ); auto& X = X_alloc.A; auto& Y = Y_alloc.A; diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 19408010f..ac5009036 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -183,11 +183,11 @@ void test_gesv_work(Params& params, bool run) int64_t info = 0; - auto A_alloc = allocate_test_Matrix( check || ref, m, n, params ); - auto B_alloc = allocate_test_Matrix( check || ref, n, nrhs, params ); + auto A_alloc = allocate_test_Matrix( check || ref, true, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, true, n, nrhs, params ); TestMatrix> X_alloc; if (is_iterative) { - X_alloc = allocate_test_Matrix( false, n, nrhs, params ); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); } auto& A = A_alloc.A; diff --git a/test/test_set.cc b/test/test_set.cc index bfd4bf8c4..68c2790c3 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_set_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,17 +49,17 @@ void test_set_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -73,49 +71,21 @@ void test_set_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - - slate::Matrix Afull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - // Create SLATE matrix from the ScaLAPACK layout. - Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - } - slate::generate_matrix( params.matrix, Afull ); + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); - // Cast to desired matrix type. 
- matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); + auto& Afull = A_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + + slate::generate_matrix( params.matrix, Afull ); - // if reference run is required, copy test data - std::vector Aref_data; - slate::Matrix Aref_full; - matrix_type Aref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); + copy_matrix( Afull, Aref_full ); } + // Cast to desired matrix type. + matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); + if (trans == slate::Op::Trans) A = transpose( A ); else if (trans == slate::Op::ConjTrans) @@ -153,25 +123,26 @@ void test_set_work(Params& params, bool run) // comparison with reference routine from ScaLAPACK // BLACS/MPI variables + int mpi_rank, myrow, mycol; blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); Cblacs_pinfo(&mpi_rank_, &nprocs); slate_assert( mpi_rank_ == mpi_rank ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit(&ictxt, "Col", p, q); + gridinfo(mpi_rank, p, q, &myrow, &mycol); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); @@ -182,6 +153,7 @@ void test_set_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); + blas_int info; scalapack_plaset( uplo2str(uplo), m, n, alpha, beta, &Aref_data[0], 1, 1, A_desc ); slate_assert(info == 0); @@ -195,10 +167,10 @@ void test_set_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. - slate::add( -one, Aref_full, one, Afull ); - real_t A_diff_norm = slate::norm( slate::Norm::One, Afull ); + subtract_matrices( Afull, Aref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::One, Aref_full ); - print_matrix( "A_diff_full", Afull, params ); + print_matrix( "A_diff_full", Aref_full, params ); real_t error = A_diff_norm / (n * A_norm); params.error() = error; @@ -207,7 +179,6 @@ void test_set_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); #endif diff --git a/test/test_utils.hh b/test/test_utils.hh new file mode 100644 index 000000000..8b4f39fd6 --- /dev/null +++ b/test/test_utils.hh @@ -0,0 +1,111 @@ +// Copyright (c) 2017-2023, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
+ +#ifndef SLATE_TEST_UTILS_HH +#define SLATE_TEST_UTILS_HH + +#include "slate/slate.hh" + +template +void matrix_iterator( matrix_type& A, matrix_type& B, + std::function thunk ) +{ + using scalar_t = typename matrix_type::value_type; + assert( A.m() == B.m() ); + assert( A.n() == B.n() ); + + int64_t B_mt = B.mt(); + int64_t B_nt = B.nt(); + if constexpr (std::is_same>::value) { + + int64_t A_j = 0, A_jj = 0; + for (int64_t B_j = 0; B_j < B_nt; ++B_j) { + + int64_t A_i = 0, A_ii = 0; + for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + #pragma omp task shared(A, B) \ + firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) + { + if (B.tileIsLocal( B_i, B_j )) { + A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), + slate::Layout::ColMajor ); + + A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); + B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); + auto TA = A( A_i, A_j ); + auto TB = B( B_i, B_j ); + int64_t mb = TB.mb(); + int64_t nb = TB.nb(); + assert( A_ii + mb <= TA.mb() ); + assert( A_jj + nb <= TA.nb() ); + int64_t lda = TA.stride(); + int64_t ldb = TB.stride(); + scalar_t const* TA_data = TA.data(); + scalar_t* TB_data = TB.data(); + for (int64_t jj = 0; jj < nb; ++jj) { + for (int64_t ii = 0; ii < mb; ++ii) { + thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], + TB_data[ ii + jj*ldb ] ); + } + } + } + else if (A.tileIsLocal( A_i, A_j )) { + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ) ); + } + } + + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; + } + } + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; + } + } + } + else { + assert( false ); + } + + A.releaseRemoteWorkspace(); +} + +// ----------------------------------------------------------------------------- +// subtract_matrices takes input matrices A and B, and performs B = B - A. +// The matrices must have the same size, but can have different tile sizes and +// distributions. However, the elements of a tile of B must all belong to the +// same tile of A + +template +void subtract_matrices( matrix_type& A, matrix_type& B ) +{ + using scalar_t = typename matrix_type::value_type; + + matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } ); +} + +// ----------------------------------------------------------------------------- +// copy_matrix copies A to B +// The matrices must have the same size, but can have different tile sizes and +// distributions. 
However, the elements of a tile of B must all belong to the +// same tile of A + +template +void copy_matrix( matrix_type& A, matrix_type& B ) +{ + using scalar_t = typename matrix_type::value_type; + + matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b = a; } ); +} + + +#endif // SLATE_TEST_UTILS_HH From 9ff51bf99f8fc3eecd727747c5f369cd9b638cb1 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 09:00:01 -0500 Subject: [PATCH 05/19] Add nonuniform-nb to run_test and tweak name in tester --- test/run_tests.py | 36 +++++++++++++++++++----------------- test/test.cc | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index 0dd06e3e1..a80751fa8 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -123,6 +123,7 @@ group_opt.add_argument( '--lookahead', action='store', help='default=%(default)s', default='1' ) group_opt.add_argument( '--dev-dist', action='store', help='default=%(default)s', default='c,r' ) group_opt.add_argument( '--nb', action='store', help='default=%(default)s', default='64,100' ) +group_opt.add_argument( '--nonuniform-nb', action='store', help='default=%(default)s', default='n' ) group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' ) group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' ) group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' ) @@ -298,6 +299,7 @@ la = ' --lookahead ' + opts.lookahead if (opts.lookahead) else '' ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else '' nb = ' --nb ' + opts.nb if (opts.nb) else '' +nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else '' nt = ' --nt ' + opts.nt if (opts.nt) else '' grid = ' --grid ' + opts.grid if (opts.grid) else '' repeat = ' --repeat ' + opts.repeat if (opts.repeat) else '' @@ -346,9 +348,9 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], # todo: hemmA GPU support @@ -384,21 +386,21 @@ def filter_csv( values, csv ): # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + thresh ], + [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb ], [ 'gesv_tntpiv', gen + dtype + la + n ], - [ 'gesv_nopiv', gen + dtype + la + n - + ' --matrix rand_dominant --nonuniform_nb n' ], + [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + thresh ], + [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb ], [ 'getrf_tntpiv', gen + dtype + la + n ], - [ 'getrf_nopiv', gen + dtype + la + n - + ' --matrix rand_dominant --nonuniform_nb n' ], + [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + thresh ], + [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb ], [ 
'getrs_tntpiv', gen + dtype + la + n + trans ], - [ 'getrs_nopiv', gen + dtype + la + n + trans - + ' --matrix rand_dominant --nonuniform_nb n' ], + [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + + ' --matrix rand_dominant' ], [ 'getri', gen + dtype + la + n ], [ 'getriOOP', gen + dtype + la + n ], @@ -646,11 +648,11 @@ def filter_csv( values, csv ): [ 'scale_row_col', gen + dtype + mn + equed ], - [ 'set', gen + dtype + mn + ab ], - [ 'tzset', gen + dtype + mn + ab + uplo ], - [ 'trset', gen + dtype + n + ab + uplo ], - [ 'syset', gen + dtype + n + ab + uplo ], - [ 'heset', gen + dtype + n + ab + uplo ], + [ 'set', gen + dtype + mn + ab + nonuniform_nb ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + uplo ], ] # ------------------------------------------------------------------------------ diff --git a/test/test.cc b/test/test.cc index ebf397f78..078df162a 100644 --- a/test/test.cc +++ b/test/test.cc @@ -403,7 +403,7 @@ Params::Params(): panel_threads("pt", 2, ParamType::List, std::max( omp_get_max_threads() / 2, 1 ), 0, 1000000, "(pt) max number of threads used in panel; default omp_num_threads / 2"), align ("align", 5, ParamType::List, 32, 1, 1024, "column alignment (sets lda, ldb, etc. to multiple of align)"), - nonuniform_nb("nonuniform_nb", + nonuniform_nb("nonuniform-nb", 0, ParamType::List, 'n', "ny", "generate matrix with nonuniform tile sizes"), debug ("debug", 0, ParamType::Value, -1, 0, 1000000, "given rank waits for debugger (gdb/lldb) to attach"), From 59b604ca3584929ab4b281f72f4fb264a21d7d3f Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 09:30:24 -0500 Subject: [PATCH 06/19] Reduce layout conversions in getrf and getrf_tntpiv This seemed to be causing some issues for getrf w/ nonuniform tiles --- src/getrf.cc | 9 ++++----- src/getrf_tntpiv.cc | 12 ++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/getrf.cc b/src/getrf.cc index cca880e4d..b9c2649ed 100644 --- a/src/getrf.cc +++ b/src/getrf.cc @@ -107,7 +107,7 @@ int64_t getrf( bcast_list_A.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}}); } A.template listBcast( - bcast_list_A, Layout::ColMajor, tag_k ); + bcast_list_A, target_layout, tag_k ); // Root broadcasts the pivot to all ranks. // todo: Panel ranks send the pivots to the right. 
@@ -139,7 +139,7 @@ int64_t getrf( internal::trsm( Side::Left, one, std::move( Tkk ), A.sub(k, k, j, j), - priority_1, Layout::ColMajor, queue_jk1 ); + priority_1, target_layout, queue_jk1 ); // send A(k, j) across column A(k+1:mt-1, j) // todo: trsm still operates in ColMajor @@ -196,7 +196,7 @@ int64_t getrf( Side::Left, one, std::move( Tkk ), A.sub(k, k, k+1+lookahead, A_nt-1), - priority_0, Layout::ColMajor, queue_1 ); + priority_0, target_layout, queue_1 ); // send A(k, kl+1:A_nt-1) across A(k+1:mt-1, kl+1:nt-1) BcastList bcast_list_A; @@ -204,9 +204,8 @@ int64_t getrf( // send A(k, j) across column A(k+1:mt-1, j) bcast_list_A.push_back({k, j, {A.sub(k+1, A_mt-1, j, j)}}); } - // todo: trsm still operates in ColMajor A.template listBcast( - bcast_list_A, Layout::ColMajor, tag_kl1); + bcast_list_A, target_layout, tag_kl1); // A(k+1:mt-1, kl+1:nt-1) -= A(k+1:mt-1, k) * A(k, kl+1:nt-1) internal::gemm( diff --git a/src/getrf_tntpiv.cc b/src/getrf_tntpiv.cc index c7c4b7892..20b88b098 100644 --- a/src/getrf_tntpiv.cc +++ b/src/getrf_tntpiv.cc @@ -223,7 +223,7 @@ int64_t getrf_tntpiv( Side::Right, one, std::move(Tkk), A.sub( k+1, A_mt-1, k, k ), - priority_1, Layout::ColMajor, queue_0 ); + priority_1, target_layout, queue_0 ); } #pragma omp task depend(inout:column[k]) \ @@ -238,7 +238,7 @@ int64_t getrf_tntpiv( bcast_list.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}, tag}); } A.template listBcastMT( - bcast_list, Layout::ColMajor ); + bcast_list, target_layout ); } // update lookahead column(s), high priority @@ -262,13 +262,13 @@ int64_t getrf_tntpiv( internal::trsm( Side::Left, one, std::move( Tkk ), A.sub( k, k, j, j ), - priority_1, Layout::ColMajor, queue_jk1 ); + priority_1, target_layout, queue_jk1 ); // send A(k, j) across column A(k+1:mt-1, j) // todo: trsm still operates in ColMajor A.tileBcast( k, j, A.sub( k+1, A_mt-1, j, j ), - Layout::ColMajor, tag_j ); + target_layout, tag_j ); } #pragma omp task depend(in:column[k]) \ @@ -306,7 +306,7 @@ int64_t getrf_tntpiv( Side::Left, one, std::move( Tkk ), A.sub( k, k, k+1+lookahead, A_nt-1 ), - priority_0, Layout::ColMajor, queue_1 ); + priority_0, target_layout, queue_1 ); } #pragma omp task depend(inout:column[k+1+lookahead]) \ @@ -322,7 +322,7 @@ int64_t getrf_tntpiv( bcast_list.push_back({k, j, {A.sub(k+1, A_mt-1, j, j)}, tag}); } A.template listBcastMT( - bcast_list, Layout::ColMajor); + bcast_list, target_layout); } From c9c04779f53e191051624a3a3b11c2cc5b638a33 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 12:21:04 -0500 Subject: [PATCH 07/19] Fix handling on internal workspaces --- include/slate/BaseMatrix.hh | 3 ++- include/slate/internal/MatrixStorage.hh | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index d263a3bd7..cb48b5147 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -3487,6 +3487,7 @@ void BaseMatrix::tileLayoutConvert( batch_count = std::max(batch_count, int64_t(bucket->second.first.size())); } + allocateBatchArrays( batch_count, 1 ); lapack::Queue* queue = comm_queue(device); @@ -3497,7 +3498,7 @@ void BaseMatrix::tileLayoutConvert( batch_count = bucket->second.first.size(); scalar_t** array_dev = this->array_device(device); - scalar_t** work_array_dev = this->array_device(device) + batch_count; + scalar_t** work_array_dev = array_dev + batch_count; assert(array_dev != nullptr); assert(work_array_dev != nullptr); diff --git a/include/slate/internal/MatrixStorage.hh 
b/include/slate/internal/MatrixStorage.hh index b2d8fb98c..5019bbaad 100644 --- a/include/slate/internal/MatrixStorage.hh +++ b/include/slate/internal/MatrixStorage.hh @@ -645,11 +645,10 @@ void MatrixStorage::allocateBatchArrays( // Free device arrays. blas::device_free(array_dev_[i][device], *queue); - // Free queues. - delete compute_queues_[i][device]; - - // Allocate queues. - compute_queues_[ i ][ device ] = new lapack::Queue( device ); + if (compute_queues_[ i ][ device ] == nullptr) { + // Allocate queues. + compute_queues_[ i ][ device ] = new lapack::Queue( device ); + } // Allocate host arrays; array_host_[i][device] From 6777f8a095c52d0891bcb3b5052dda997ea30465 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 13:20:49 -0500 Subject: [PATCH 08/19] Add nonuniform tiles to add testers --- test/test_add.cc | 86 ++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 61 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index 9f8703500..b38228dc5 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_add_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,17 +49,17 @@ void test_add_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -73,38 +71,16 @@ void test_add_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data, B_data; - - int64_t nlocB = nlocA, lldB = lldA; - - slate::Matrix Afull, Bfull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - Bfull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Bfull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - B_data.resize( lldB*nlocB ); - // Create SLATE matrix from the ScaLAPACK layout. 
- Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - Bfull = slate::Matrix::fromScaLAPACK( - m, n, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Bfull = B_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Bref_full = B_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& Bref_data = B_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); slate::generate_matrix( params.matrix, Bfull ); @@ -113,22 +89,9 @@ void test_add_work(Params& params, bool run) matrix_type B = matrix_cast< matrix_type >( Bfull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data, Bref_data; - slate::Matrix Aref_full, Bref_full; - matrix_type Aref, Bref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); - - Bref_data.resize( lldB*nlocB ); - Bref_full = slate::Matrix::fromScaLAPACK( - m, n, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - Bref = matrix_cast< matrix_type >( Bfull, uplo, diag ); - slate::copy( Bfull, Bref_full ); + copy_matrix( Afull, Aref_full ); + copy_matrix( Bfull, Bref_full ); } //if (trans == slate::Op::Trans) @@ -172,23 +135,24 @@ void test_add_work(Params& params, bool run) blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9], B_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; + int mpi_rank, myrow, mycol; // initialize BLACS and ScaLAPACK Cblacs_pinfo(&mpi_rank_, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); slate_assert( mpi_rank_ == mpi_rank ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit(&ictxt, "Col", p, q); + gridinfo(mpi_rank, p, q, &myrow, &mycol); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - scalapack_descinit(B_desc, m, n, nb, nb, 0, 0, ictxt, lldB, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); real_t B_norm = slate::norm( slate::Norm::Max, B ); @@ -201,6 +165,7 @@ void test_add_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); + int64_t info; if (uplo == slate::Uplo::General) { scalapack_pgeadd( op2str( trans ), m, n, alpha, &Aref_data[0], 1, 1, A_desc, @@ -223,10 +188,10 @@ void test_add_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - slate::add( -one, Bref_full, one, Bfull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); - real_t B_diff_norm = slate::norm( slate::Norm::Max, Bfull ); + subtract_matrices( Afull, Aref_full ); + subtract_matrices( Bfull, Bref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::One, Aref_full ); + real_t B_diff_norm = slate::norm( slate::Norm::One, Bref_full ); print_matrix( "A_diff_full", Afull, params ); print_matrix( "B_diff_full", Bfull, params ); @@ -242,7 +207,6 @@ void test_add_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); SLATE_UNUSED( trans ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); From 931c887a0478836ede01863d5dfcb8060f4e292c Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 13:29:02 -0500 Subject: [PATCH 09/19] Add nonuniform tiles to scale testers --- test/test_scale.cc | 62 +++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/test/test_scale.cc b/test/test_scale.cc index c1bfc985c..a26d66265 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_scale_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,17 +49,17 @@ void test_scale_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -73,47 +71,20 @@ void test_scale_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - - slate::Matrix Afull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - // Create SLATE matrix from the ScaLAPACK layout. - Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); // Cast to desired matrix type. 
matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data; - slate::Matrix Aref_full; - matrix_type Aref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); + copy_matrix( Afull, Aref_full ); } //if (trans == slate::Op::Trans) @@ -152,25 +123,26 @@ void test_scale_work(Params& params, bool run) // comparison with reference routine from ScaLAPACK // BLACS/MPI variables + int mpi_rank, myrow, mycol; blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); Cblacs_pinfo(&mpi_rank_, &nprocs); slate_assert( mpi_rank_ == mpi_rank ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit(&ictxt, "Col", p, q); + gridinfo(mpi_rank, p, q, &myrow, &mycol); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); @@ -181,6 +153,7 @@ void test_scale_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); + int64_t info; scalapack_plascl( uplo2str(uplo), alpha, beta, m, n, &Aref_data[0], 1, 1, A_desc, &info ); slate_assert(info == 0); @@ -194,8 +167,8 @@ void test_scale_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); + subtract_matrices( Afull, Aref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); print_matrix( "A_diff_full", Afull, params ); @@ -208,7 +181,6 @@ void test_scale_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); #endif From 4a735e833fc7e5e962ebcb9dd0dad18a3867c27e Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 14:09:24 -0500 Subject: [PATCH 10/19] De-duplicate scalapack setup code --- test/grid_utils.hh | 31 +++++++++++++++++++++++++++++++ test/test_add.cc | 21 ++------------------- test/test_gemm.cc | 20 ++------------------ test/test_gesv.cc | 27 +++------------------------ test/test_scale.cc | 21 ++------------------- test/test_set.cc | 21 ++------------------- 6 files changed, 42 insertions(+), 99 deletions(-) diff --git a/test/grid_utils.hh b/test/grid_utils.hh index 945cddafc..b03a27a59 100644 --- a/test/grid_utils.hh +++ b/test/grid_utils.hh @@ -7,6 +7,7 @@ #define SLATE_GRID_UTILS_HH #include "slate/slate.hh" +#include "scalapack_wrappers.hh" #include @@ -56,4 +57,34 @@ inline void gridinfo( gridinfo( mpi_rank, slate::GridOrder::Col, p, q, my_row, my_col ); } +//------------------------------------------------------------------------------ +#ifdef SLATE_HAVE_SCALAPACK +// Sets up a BLACS context with the given grid +inline void create_ScaLAPACK_context( slate::GridOrder grid_order, + int p, int q, + blas_int* ictxt ) +{ + // BLACS/MPI variables + int mpi_rank, myrow, mycol; + blas_int p_, q_, myrow_, mycol_; + blas_int mpi_rank_ = 0, nprocs = 1; + + // initialize BLACS and ScaLAPACK + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); + Cblacs_pinfo( &mpi_rank_, &nprocs ); + slate_assert( mpi_rank_ == mpi_rank ); + + Cblacs_get( -1, 0, ictxt ); + + slate_assert( p*q <= nprocs ); + Cblacs_gridinit( ictxt, grid_order2str( grid_order ), p, q ); + gridinfo( mpi_rank, p, q, &myrow, &mycol ); + Cblacs_gridinfo( *ictxt, &p_, &q_, &myrow_, &mycol_ ); + slate_assert( p == p_ ); + slate_assert( q == q_ ); + slate_assert( myrow == myrow_ ); + slate_assert( mycol == mycol_ ); +} +#endif // SLATE_HAVE_SCALAPACK + #endif // SLATE_GRID_UTILS_HH diff --git a/test/test_add.cc b/test/test_add.cc index b38228dc5..b2d9fe6e7 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -131,26 +131,9 @@ void test_add_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - int mpi_rank, myrow, mycol; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - + blas_int ictxt, A_desc[9], B_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gemm.cc 
b/test/test_gemm.cc index 659186610..bd5425f72 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -229,25 +229,9 @@ void test_gemm_work(Params& params, bool run) return; } - // BLACS/MPI variables - int mpi_rank, myrow, mycol; - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + create_ScaLAPACK_context( grid_order, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gesv.cc b/test/test_gesv.cc index ac5009036..a19beadb8 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -390,35 +390,14 @@ void test_gesv_work(Params& params, bool run) return; } - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - - // BLACS/MPI variables - blas_int ictxt, myrow_, mycol_, p_, q_; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int nb = A_alloc.nb; - - // ScaLAPACK descriptor for the reference matrix - blas_int Aref_desc[9], Bref_desc[9]; - + blas_int ictxt, Aref_desc[9], Bref_desc[9]; + create_ScaLAPACK_context( grid_order, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); // ScaLAPACK data for pivots. - std::vector ipiv_ref(A_alloc.lld + nb); + std::vector ipiv_ref(A_alloc.lld + A_alloc.nb); if (params.routine == "getrs") { // Factor matrix A. 
diff --git a/test/test_scale.cc b/test/test_scale.cc index a26d66265..a98ffcb2d 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -122,26 +122,9 @@ void test_scale_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - int mpi_rank, myrow, mycol; - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - + blas_int ictxt, A_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); diff --git a/test/test_set.cc b/test/test_set.cc index 68c2790c3..9cb89e5e8 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -122,26 +122,9 @@ void test_set_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - int mpi_rank, myrow, mycol; - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - + blas_int ictxt, A_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); From 50b8fd135cf25a1e81a23da71c1190b80bf81a76 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 14:30:02 -0500 Subject: [PATCH 11/19] Add nonuniform tiles to scale_row_col tester --- test/grid_utils.hh | 2 +- test/test_scale_row_col.cc | 84 +++++++++----------------------------- 2 files changed, 21 insertions(+), 65 deletions(-) diff --git a/test/grid_utils.hh b/test/grid_utils.hh index b03a27a59..20ba6b9f6 100644 --- a/test/grid_utils.hh +++ b/test/grid_utils.hh @@ -78,7 +78,7 @@ inline void create_ScaLAPACK_context( slate::GridOrder grid_order, slate_assert( p*q <= nprocs ); Cblacs_gridinit( ictxt, grid_order2str( grid_order ), p, q ); - gridinfo( mpi_rank, p, q, &myrow, &mycol ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); Cblacs_gridinfo( *ictxt, &p_, &q_, &myrow_, &mycol_ ); slate_assert( p == p_ ); slate_assert( q == q_ ); diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 6c11bb3e7..163862391 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -29,9 +30,6 @@ void test_scale_row_col_work( Params& params, bool run ) using slate::ceildiv; using slate::Equed; - // Constants - const 
scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,7 +49,6 @@ void test_scale_row_col_work( Params& params, bool run ) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; @@ -59,10 +56,11 @@ void test_scale_row_col_work( Params& params, bool run ) bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; int verbose = params.verbose(); - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -74,47 +72,20 @@ void test_scale_row_col_work( Params& params, bool run ) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); - gridinfo( mpi_rank, p, q, &myrow, &mycol ); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols( m, nb, myrow, p ); - int64_t nlocA = num_local_rows_cols( n, nb, mycol, q ); - int64_t lldA = blas::max( 1, mlocA ); // local leading dimension of A - std::vector A_data; - - slate::Matrix Afull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - // Create SLATE matrix from the ScaLAPACK layout. - Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); // Cast to desired matrix type. matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data; - slate::Matrix Aref_full; - matrix_type Aref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. 
- Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); + copy_matrix( Afull, Aref_full ); } if (trans == slate::Op::Trans) @@ -165,30 +136,17 @@ void test_scale_row_col_work( Params& params, bool run ) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo( &mpi_rank_, &nprocs ); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert( p*q <= nprocs ); - Cblacs_get( -1, 0, &ictxt ); - Cblacs_gridinit( &ictxt, "Col", p, q ); - Cblacs_gridinfo( ictxt, &p_, &q_, &myrow_, &mycol_ ); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit( A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info ); - slate_assert( info == 0 ); + blas_int ictxt, A_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_max = slate::norm( slate::Norm::Max, A ); - std::vector Rlocal( mlocA ), Clocal( nlocA ); + std::vector Rlocal( A_alloc.mloc ), Clocal( A_alloc.nloc ); + + int myrow, mycol; + gridinfo( A.mpiRank(), slate::GridOrder::Col, p, q, &myrow, &mycol ); // Copy local part of R. int64_t ii = 0, iilocal = 0; @@ -235,7 +193,6 @@ void test_scale_row_col_work( Params& params, bool run ) scalapack_plaqge( m, n, &Aref_data[0], 1, 1, A_desc, Rlocal.data(), Clocal.data(), rowcnd, colcnd, A_max, &equed_out ); - slate_assert( info == 0 ); time = barrier_get_wtime( MPI_COMM_WORLD ) - time; params.ref_time() = time; @@ -248,8 +205,8 @@ void test_scale_row_col_work( Params& params, bool run ) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); + subtract_matrices( Afull, Aref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); print_matrix( "A_diff_full", Afull, params ); int64_t i = blas::iamax( m, &R[ 0 ], 1 ); @@ -264,7 +221,7 @@ void test_scale_row_col_work( Params& params, bool run ) real_t eps = std::numeric_limits::epsilon(); params.okay() = (error <= 3*eps); - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf( "%% A_diff_norm %.2e, A_max %.2e, R_max %.2e," " C_max %.2e, eps %.2e, 3*eps %.2e, error %.2e\n", A_diff_norm, A_max, R_max, C_max, eps, 3*eps, error ); @@ -273,7 +230,6 @@ void test_scale_row_col_work( Params& params, bool run ) Cblacs_gridexit( ictxt ); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); SLATE_UNUSED( verbose ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); From 1dfa15d57204cce7d810c48973319a75ca8538cd Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 15:28:42 -0500 Subject: [PATCH 12/19] Add nonuniform tiles to copy tester --- test/test_copy.cc | 102 ++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 76 deletions(-) diff --git a/test/test_copy.cc b/test/test_copy.cc index 5760b9f62..7f19380a2 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_copy_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -49,17 +47,17 @@ void test_copy_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -76,34 +74,16 @@ void test_copy_work(Params& params, bool run) MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); gridinfo(mpi_rank, p, q, &myrow, &mycol); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data, B_data; - real_t A_norm, B_norm; - - int64_t nlocB = nlocA, lldB = lldA; - - slate::Matrix Afull, Bfull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - Bfull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Bfull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - B_data.resize( lldB*nlocB ); - // Create SLATE matrix from the ScaLAPACK layout. 
- Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - Bfull = slate::Matrix::fromScaLAPACK( - m, n, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Bfull = B_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Bref_full = B_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& Bref_data = B_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); slate::generate_matrix( params.matrix, Bfull ); @@ -112,22 +92,9 @@ void test_copy_work(Params& params, bool run) matrix_type B = matrix_cast< matrix_type >( Bfull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data, Bref_data; - slate::Matrix Aref_full, Bref_full; - matrix_type Aref, Bref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); - - Bref_data.resize( lldB*nlocB ); - Bref_full = slate::Matrix::fromScaLAPACK( - m, n, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - Bref = matrix_cast< matrix_type >( Bfull, uplo, diag ); - slate::copy( Bfull, Bref_full ); + copy_matrix( Afull, Aref_full ); + copy_matrix( Bfull, Bref_full ); } //if (trans == slate::Op::Trans) @@ -171,30 +138,14 @@ void test_copy_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - scalapack_descinit(B_desc, m, n, nb, nb, 0, 0, ictxt, lldB, &info); - slate_assert(info == 0); - - A_norm = slate::norm( slate::Norm::Max, A ); - B_norm = slate::norm( slate::Norm::Max, B ); + blas_int ictxt, A_desc[9], B_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + + real_t A_norm = slate::norm( slate::Norm::Max, A ); + real_t B_norm = slate::norm( slate::Norm::Max, B ); print_matrix( "Aref_full", Aref_full, params ); print_matrix( "Bref_full", Bref_full, params ); @@ -218,10 +169,10 @@ void test_copy_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - slate::add( -one, Bref_full, one, Bfull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); - real_t B_diff_norm = slate::norm( slate::Norm::Max, Bfull ); + subtract_matrices( Afull, Aref_full ); + subtract_matrices( Bfull, Bref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); + real_t B_diff_norm = slate::norm( slate::Norm::Max, Bref_full ); print_matrix( "A_diff_full", Afull, params ); print_matrix( "B_diff_full", Bfull, params ); @@ -235,7 +186,6 @@ void test_copy_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); SLATE_UNUSED( A_norm ); SLATE_UNUSED( B_norm ); if (mpi_rank == 0) From 2065590fc3bc18622f21d7c17715fc132506d830 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 15:45:04 -0500 Subject: [PATCH 13/19] Add nonuniform-nb to more tests in run_tests --- test/run_tests.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index a80751fa8..4561e217f 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -406,8 +406,8 @@ def filter_csv( values, csv ): [ 'getriOOP', gen + dtype + la + n ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' ], + [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb ], [ 'gesv_rbt', gen + dtype + la + n ], ] @@ -628,25 +628,25 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab ], - [ 'tzadd', gen + dtype + mn + ab + uplo ], - [ 'tradd', gen + dtype + n + ab + uplo ], - [ 'syadd', gen + dtype + n + ab + uplo ], - [ 'headd', gen + dtype + n + ab + uplo ], - - [ 'copy', gen + dtype + mn ], - [ 'tzcopy', gen + dtype + mn + uplo ], - [ 'trcopy', gen + dtype + n + uplo ], - [ 'sycopy', gen + dtype + n + uplo ], - [ 'hecopy', gen + dtype + n + uplo ], - - [ 'scale', gen + dtype + mn + ab ], - [ 'tzscale', gen + dtype + mn + ab + uplo ], - [ 'trscale', gen + dtype + n + ab + uplo ], - [ 'syscale', gen + dtype + n + ab + uplo ], - [ 'hescale', gen + dtype + n + ab + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + uplo ], + + [ 'copy', gen + dtype + mn + nonuniform_nb ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + uplo ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + uplo ], + + [ 'scale', gen + dtype + mn + ab + nonuniform_nb ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + uplo ], + [ 'trscale', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + uplo ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb ], [ 'set', gen + dtype + mn + ab + nonuniform_nb ], [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + uplo ], From 
4b8386fa1760efd8d461d24af59003a7117bda51 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 16:13:11 -0500 Subject: [PATCH 14/19] Avoid extra function calls when unneeded --- include/slate/BaseMatrix.hh | 52 ++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index cb48b5147..5e88b43e5 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -904,32 +904,36 @@ BaseMatrix::BaseMatrix( ++nt_; } - // Pass variables into lambda w/out full this object - int64_t mt_1 = mt_-1; - int64_t last_mb = last_mb_; - int64_t nt_1 = nt_-1; - int64_t last_nb = last_nb_; - std::function - realTileMb = [inTileMb, last_mb, mt_1](int64_t i) { - if (i == mt_1) { - return last_mb; - } - else { - return inTileMb( i ); - } - }; - std::function - realTileNb = [inTileNb, last_nb, nt_1](int64_t j) { - if (j == nt_1) { - return last_nb; - } - else { - return inTileNb( j ); - } - }; + auto actTileMb = inTileMb; + if (last_mb_ != actTileMb( mt_-1 )) { + // Pass variables into lambda w/out full this object + int64_t mt_1 = mt_-1; + int64_t last_mb = last_mb_; + actTileMb = [inTileMb, last_mb, mt_1](int64_t i) { + if (i == mt_1) { + return last_mb; + } + else { + return inTileMb( i ); + } + }; + } + auto actTileNb = inTileNb; + if (last_nb_ != actTileNb( nt_-1 )) { + int64_t nt_1 = nt_-1; + int64_t last_nb = last_nb_; + actTileNb = [inTileNb, last_nb, nt_1](int64_t j) { + if (j == nt_1) { + return last_nb; + } + else { + return inTileNb( j ); + } + }; + } storage_ = std::make_shared< MatrixStorage< scalar_t > >( - mt_, nt_, realTileMb, realTileNb, + mt_, nt_, actTileMb, actTileNb, inTileRank, inTileDevice, mpi_comm ); slate_mpi_call( From 917790b2b63317ceda460d78b391bc77e1540580 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 16:23:20 -0500 Subject: [PATCH 15/19] Fix issue in scale_row_col tester --- test/matrix_utils.hh | 4 ++-- test/test_scale_row_col.cc | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index fbc3c2a22..f4f923bb1 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -286,13 +286,13 @@ TestMatrix> allocate_test_Matrix( matrix.m = m; matrix.n = n; - // ScaLAPACK variables + // ScaLAPACK variables for reference matrix int mpi_rank, myrow, mycol; MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); matrix.nb = nb; matrix.mloc = num_local_rows_cols( m, nb, myrow, p ); - matrix.nloc = num_local_rows_cols (n, nb, mycol, q ); + matrix.nloc = num_local_rows_cols( n, nb, mycol, q ); matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A // Functions for nonuniform tile sizes diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 163862391..7c1ed4ffd 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -150,8 +150,8 @@ void test_scale_row_col_work( Params& params, bool run ) // Copy local part of R. int64_t ii = 0, iilocal = 0; - for (int64_t i = 0; i < A.mt(); ++i) { - int64_t mb_ = A.tileMb( i ); + for (int64_t i = 0; i < Aref_full.mt(); ++i) { + int64_t mb_ = Aref_full.tileMb( i ); if (i % p == myrow) { std::copy( &R[ ii ], &R[ ii + mb_ ], &Rlocal[ iilocal ] ); iilocal += mb_; @@ -162,8 +162,8 @@ void test_scale_row_col_work( Params& params, bool run ) // Copy local part of R. 
int64_t jj = 0, jjlocal = 0; - for (int64_t j = 0; j < A.nt(); ++j) { - int64_t nb_ = A.tileNb( j ); + for (int64_t j = 0; j < Aref_full.nt(); ++j) { + int64_t nb_ = Aref_full.tileNb( j ); if (j % q == mycol) { std::copy( &C[ jj ], &C[ jj + nb_ ], &Clocal[ jjlocal ] ); jjlocal += nb_; From e9cfd7d7f4db0d1ad8de1920ef148a06ba2d8866 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 15 Nov 2023 09:27:08 -0500 Subject: [PATCH 16/19] Remove some vestigial code --- test/test_copy.cc | 5 ----- test/test_set.cc | 2 -- 2 files changed, 7 deletions(-) diff --git a/test/test_copy.cc b/test/test_copy.cc index 7f19380a2..84721bb9d 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -69,11 +69,6 @@ void test_copy_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); auto B_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); diff --git a/test/test_set.cc b/test/test_set.cc index 9cb89e5e8..2dc088f0a 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -136,10 +136,8 @@ void test_set_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); - blas_int info; scalapack_plaset( uplo2str(uplo), m, n, alpha, beta, &Aref_data[0], 1, 1, A_desc ); - slate_assert(info == 0); time = barrier_get_wtime(MPI_COMM_WORLD) - time; params.ref_time() = time; From 995e394d7c80aaa716ecf20a07fb52336b6e36e1 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 15 Nov 2023 09:33:45 -0500 Subject: [PATCH 17/19] Remove an assertion that broke assumptions in a few routines --- include/slate/internal/MatrixStorage.hh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/slate/internal/MatrixStorage.hh b/include/slate/internal/MatrixStorage.hh index 5019bbaad..cea953d4e 100644 --- a/include/slate/internal/MatrixStorage.hh +++ b/include/slate/internal/MatrixStorage.hh @@ -110,7 +110,6 @@ public: auto tile = tiles_[device+1]; tiles_[device+1] = nullptr; if (tile != nullptr) { - assert(!tile->stateOn(MOSI::OnHold)); delete tile; --num_instances_; } From b4b88f866792934166cc7f2e30f1e92fea889bc5 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Tue, 16 Jan 2024 14:54:47 -0500 Subject: [PATCH 18/19] Update docs, style. Use max-norm. --- include/slate/BaseMatrix.hh | 14 ++++++------ test/matrix_utils.hh | 22 +++++++++---------- test/test_add.cc | 12 ++++++---- test/test_gemm.cc | 2 +- test/test_utils.hh | 44 +++++++++++++++++++++++-------------- 5 files changed, 54 insertions(+), 40 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index 5e88b43e5..086aff8d0 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -906,7 +906,7 @@ BaseMatrix::BaseMatrix( auto actTileMb = inTileMb; if (last_mb_ != actTileMb( mt_-1 )) { - // Pass variables into lambda w/out full this object + // Pass variables into lambda w/out reference to `this`. int64_t mt_1 = mt_-1; int64_t last_mb = last_mb_; actTileMb = [inTileMb, last_mb, mt_1](int64_t i) { @@ -2710,14 +2710,14 @@ void BaseMatrix::tileGet(int64_t i, int64_t j, int dst_device, + " -> " + std::to_string(dst_device)); } - target_layout = layout == LayoutConvert::None ? - src_tile->layout() : - Layout(layout); + target_layout = layout == LayoutConvert::None + ? 
src_tile->layout() + : Layout(layout); } else { - target_layout = layout == LayoutConvert::None ? - tile_node[dst_device]->layout() : - Layout(layout); + target_layout = layout == LayoutConvert::None + ? tile_node[dst_device]->layout() + : Layout(layout); } if (! tile_node.existsOn(dst_device)) { diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index f4f923bb1..ea20fd584 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -206,7 +206,7 @@ matrix_type matrix_cast( } -// ----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Functions for allocating test matrices template @@ -234,7 +234,7 @@ public: #endif }; -// ----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Marks the paramters used by allocate_test_Matrix inline void mark_params_for_test_Matrix(Params& params) { @@ -246,23 +246,23 @@ inline void mark_params_for_test_Matrix(Params& params) params.grid_order(); } -// ----------------------------------------------------------------------------- -/// Allocates a Matrix and a reference version for testing. +//------------------------------------------------------------------------------ +/// Allocates a Matrix and optionally a reference version for testing. /// -/// @param ref_matrix[in] +/// @param[in] ref_matrix /// Whether to allocate a reference matrix /// -/// @param nonuniform_ref[in] +/// @param[in] nonuniform_ref /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. /// -/// @param m[in] +/// @param[in] m /// The number of rows /// -/// @param n[in] +/// @param[in] n /// The number of columns /// -/// @param params[in] +/// @param[in] params /// The test params object which contains many of the key parameters /// template @@ -295,10 +295,10 @@ TestMatrix> allocate_test_Matrix( matrix.nloc = num_local_rows_cols( n, nb, mycol, q ); matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A - // Functions for nonuniform tile sizes + // Functions for nonuniform tile sizes. + // Odd-numbered tiles are 2*nb, even-numbered tiles are nb. std::function< int64_t (int64_t j) > tileNb = [nb](int64_t j) { - // for non-uniform tile size return (j % 2 != 0 ? nb*2 : nb); }; auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); diff --git a/test/test_add.cc b/test/test_add.cc index b2d9fe6e7..3e51600b3 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -173,14 +173,18 @@ void test_add_work(Params& params, bool run) // the upper triangle. subtract_matrices( Afull, Aref_full ); subtract_matrices( Bfull, Bref_full ); - real_t A_diff_norm = slate::norm( slate::Norm::One, Aref_full ); - real_t B_diff_norm = slate::norm( slate::Norm::One, Bref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); + real_t B_diff_norm = slate::norm( slate::Norm::Max, Bref_full ); print_matrix( "A_diff_full", Afull, params ); print_matrix( "B_diff_full", Bfull, params ); - real_t errorA = A_diff_norm / (n * A_norm); - real_t errorB = B_diff_norm / (n * B_norm); + // Ideally this would be element-wise error, + // max_ij( | A_ij – Aref_ij | / | Aref_ij | ), + // instead of norm-wise error, + // max_ij( | A_ij – Aref_ij | ) / max_ij( | Aref_ij | ). 
+ real_t errorA = A_diff_norm / A_norm; + real_t errorB = B_diff_norm / B_norm; params.error() = errorA + errorB; real_t tol = params.tol() * std::numeric_limits::epsilon()/2; diff --git a/test/test_gemm.cc b/test/test_gemm.cc index bd5425f72..746e2f74d 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -78,7 +78,7 @@ void test_gemm_work(Params& params, bool run) return; #ifndef SLATE_HAVE_SCALAPACK - // Can only run ref when we have ScaLAPACK + // Can run ref only when we have ScaLAPACK. if (ref) { if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); diff --git a/test/test_utils.hh b/test/test_utils.hh index 8b4f39fd6..a946411b0 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -8,15 +8,25 @@ #include "slate/slate.hh" +///----------------------------------------------------------------------------- +/// Applies the operator thunk to each element of A and B to update B. +/// The matrices must have the same size, but can have different tile sizes and +/// distributions. However, the elements of a tile of B must all belong to the +/// same tile of A. For example, this is satisfied in the testers if B +/// has tiles of size nb and A has tiles of size nb or 2*nb. +/// template -void matrix_iterator( matrix_type& A, matrix_type& B, - std::function thunk ) +void matrix_iterator( + matrix_type& A, matrix_type& B, + std::function< void( typename matrix_type::value_type const&, + typename matrix_type::value_type& ) > thunk ) { using scalar_t = typename matrix_type::value_type; assert( A.m() == B.m() ); assert( A.n() == B.n() ); + const auto ColMajor = slate::LayoutConvert::ColMajor; + int64_t B_mt = B.mt(); int64_t B_nt = B.nt(); if constexpr (std::is_same>::value) { @@ -33,8 +43,8 @@ void matrix_iterator( matrix_type& A, matrix_type& B, A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), slate::Layout::ColMajor ); - A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); - B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); + A.tileGetForReading( A_i, A_j, ColMajor ); + B.tileGetForWriting( B_i, B_j, ColMajor ); auto TA = A( A_i, A_j ); auto TB = B( B_i, B_j ); int64_t mb = TB.mb(); @@ -79,12 +89,12 @@ void matrix_iterator( matrix_type& A, matrix_type& B, A.releaseRemoteWorkspace(); } -// ----------------------------------------------------------------------------- -// subtract_matrices takes input matrices A and B, and performs B = B - A. -// The matrices must have the same size, but can have different tile sizes and -// distributions. However, the elements of a tile of B must all belong to the -// same tile of A - +///----------------------------------------------------------------------------- +/// subtract_matrices takes input matrices A and B, and performs B = B - A. +/// The matrices must have the same size, but can have different tile sizes and +/// distributions. However, the elements of a tile of B must all belong to the +/// same tile of A. +/// template void subtract_matrices( matrix_type& A, matrix_type& B ) { @@ -93,12 +103,12 @@ void subtract_matrices( matrix_type& A, matrix_type& B ) matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } ); } -// ----------------------------------------------------------------------------- -// copy_matrix copies A to B -// The matrices must have the same size, but can have different tile sizes and -// distributions. 
However, the elements of a tile of B must all belong to the -// same tile of A - +///----------------------------------------------------------------------------- +/// copy_matrix copies A to B +/// The matrices must have the same size, but can have different tile sizes and +/// distributions. However, the elements of a tile of B must all belong to the +/// same tile of A. +/// template void copy_matrix( matrix_type& A, matrix_type& B ) { From 2bb33f39421c9e196cdeff09d8b9a9c5acf0becd Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Tue, 16 Jan 2024 17:04:37 -0500 Subject: [PATCH 19/19] Add msg for nonuniform nb --- test/matrix_utils.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index ea20fd584..7d7f44a96 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -311,6 +311,8 @@ TestMatrix> allocate_test_Matrix( // SLATE allocates CPU or GPU tiles. slate::Target origin_target = origin2target( origin ); if (nonuniform_nb) { + params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) ) + + ", " + std::to_string( tileNb( 1 ) ); matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); }
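For reference, a condensed sketch of how a tester body composes the helpers introduced across this series (mark_params_for_test_Matrix, allocate_test_Matrix, copy_matrix, create_ScaLAPACK_context, ScaLAPACK_descriptor, subtract_matrices). This is illustrative only and not part of any patch; the `false` second argument (the nonuniform_ref flag), the explicit template argument, and the routine-specific steps are assumptions based on the calls shown above.

// Illustrative sketch only -- not part of the patch series.
template <typename scalar_t>
void test_example_work( Params& params, bool run )
{
    using real_t = blas::real_type<scalar_t>;

    // Register grid/nb/origin/nonuniform_nb parameters read by allocate_test_Matrix.
    mark_params_for_test_Matrix( params );
    params.matrix.mark();
    if (! run)
        return;

    int64_t m = params.dim.m();
    int64_t n = params.dim.n();
    int64_t p = params.grid.m();
    int64_t q = params.grid.n();
    bool check = params.check() == 'y';
    bool ref   = params.ref()   == 'y';

    // Allocate the SLATE test matrix and, when checking, a reference copy
    // (ScaLAPACK layout unless nonuniform tiles are requested).
    auto A_alloc = allocate_test_Matrix<scalar_t>( check || ref, false, m, n, params );
    auto& Afull     = A_alloc.A;
    auto& Aref_full = A_alloc.Aref;

    slate::generate_matrix( params.matrix, Afull );
    if (check || ref)
        copy_matrix( Afull, Aref_full );  // handles differing tile sizes/distributions

    // ... run the SLATE routine on Afull and the reference routine on Aref_full ...

#ifdef SLATE_HAVE_SCALAPACK
    blas_int ictxt, A_desc[9];
    create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
    A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );

    // Compare: Aref_full -= Afull, then measure the difference in the max-norm.
    subtract_matrices( Afull, Aref_full );
    real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full );
    params.error() = A_diff_norm;
    params.okay()  = (A_diff_norm == 0.0);

    Cblacs_gridexit( ictxt );
#endif
}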