From 3d6965eb88dfe53b2e7cb82196e023136db755b1 Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Fri, 15 Dec 2023 10:01:53 -0500
Subject: [PATCH 01/33] Add nonuniform nb support to test_posv

---
 test/matrix_utils.hh | 121 +++++++++++++++++++++++++++++++++++++++
 test/test.hh         |   6 +-
 test/test_posv.cc    | 132 ++++++++----------------------------------
 3 files changed, 148 insertions(+), 111 deletions(-)

diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 7d7f44a96..750afddba 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -349,4 +349,125 @@ TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
     return matrix;
 }
 
+// -----------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_HermitianMatrix
+inline void mark_params_for_test_HermitianMatrix(Params& params)
+{
+    params.grid.m();
+    params.grid.n();
+    params.dev_dist();
+    params.uplo();
+    params.nb();
+    params.nonuniform_nb();
+    params.origin();
+    params.grid_order();
+}
+
+// -----------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and a reference version for testing.
+///
+/// @param ref_matrix[in]
+///     Whether to allocate a reference matrix
+///
+/// @param nonuniform_ref[in]
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param n[in]
+///     The matrix dimension; the matrix is n-by-n
+///
+/// @param params[in]
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    int64_t p = params.grid.m();
+    int64_t q = params.grid.n();
+    slate::Dist dev_dist = params.dev_dist();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+
+    // The object to be returned
+    TestMatrix<slate::HermitianMatrix<scalar_t>> matrix;
+    matrix.m = n;
+    matrix.n = n;
+
+    // ScaLAPACK variables for reference matrix
+    int mpi_rank, myrow, mycol;
+    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
+    gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
+    matrix.nb = nb;
+    matrix.mloc = num_local_rows_cols( n, nb, myrow, p );
+    matrix.nloc = num_local_rows_cols( n, nb, mycol, q );
+    matrix.lld  = blas::max( 1, matrix.mloc ); // local leading dimension of A
+
+    // Functions for nonuniform tile sizes or row device distribution
+    std::function< int64_t (int64_t j) > tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        tileNb = slate::func::uniform_blocksize( n, nb );
+    }
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
+                                                   p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        if (nonuniform_nb || dev_dist == slate::Dist::Row) {
+            matrix.A = slate::HermitianMatrix<scalar_t>(
+                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+        }
+        else {
+            matrix.A = slate::HermitianMatrix<scalar_t>(
+                    uplo, n, nb, p, q, MPI_COMM_WORLD);
+        }
+
+        // SLATE allocates CPU or GPU tiles.
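+        // (origin2target() is assumed to map Origin::Host and
+        // Origin::Devices to the matching Target, so the insertLocalTiles()
+        // call below places tiles in CPU or GPU memory accordingly.)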
+        slate::Target origin_target = origin2target(origin);
+        matrix.A.insertLocalTiles(origin_target);
+    }
+    else {
+        assert( !nonuniform_nb );
+        // Create SLATE matrix from the ScaLAPACK layouts
+        matrix.A_data.resize( matrix.lld * matrix.nloc );
+        matrix.A = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
+                uplo, n, &matrix.A_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+    }
+
+    // Setup reference matrix
+    if (ref_matrix) {
+        if (nonuniform_nb && nonuniform_ref) {
+            matrix.Aref = slate::HermitianMatrix<scalar_t>(
+                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+            matrix.Aref.insertLocalTiles( slate::Target::Host );
+        }
+        else {
+            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
+            matrix.Aref = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
+                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+        }
+    }
+
+    return matrix;
+}
+
+
 #endif // SLATE_MATRIX_UTILS_HH
diff --git a/test/test.hh b/test/test.hh
index b0d48e0f3..0c62fb9f0 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -27,9 +27,9 @@ enum class Origin {
     Devices,
 };
 
-enum class Dist {
-    Row,
-    Col,
+enum class Dist : char {
+    Row = 'R',
+    Col = 'C',
 };
 
 } // namespace slate
diff --git a/test/test_posv.cc b/test/test_posv.cc
index 46f3c3cc6..4803fa137 100644
--- a/test/test_posv.cc
+++ b/test/test_posv.cc
@@ -8,12 +8,13 @@
 #include "blas/flops.hh"
 #include "lapack/flops.hh"
 #include "print_matrix.hh"
+#include "grid_utils.hh"
+#include "matrix_utils.hh"
 
 #include "scalapack_wrappers.hh"
 #include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "auxiliary/Debug.hh"
-#include "grid_utils.hh"
 
 #include
 #include
@@ -46,6 +47,7 @@ void test_posv_work(Params& params, bool run)
     int timer_level = params.timer_level();
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
+    slate::GridOrder grid_order = params.grid_order();
     slate::Dist dev_dist = params.dev_dist();
     params.matrix.mark();
     params.matrixB.mark();
@@ -158,95 +160,31 @@ void test_posv_work(Params& params, bool run)
 
     int64_t info = 0;
 
-    // Matrix A: figure out local size.
-    int64_t mlocA = num_local_rows_cols(n, nb, myrow, p);
-    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
-    int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
-
-    // Matrix B: figure out local size.
-    int64_t mlocB = num_local_rows_cols(n, nb, myrow, p);
-    int64_t nlocB = num_local_rows_cols(nrhs, nb, mycol, q);
-    int64_t lldB  = blas::max(1, mlocB); // local leading dimension of B
-
-    // ScaLAPACK data if needed.
-    std::vector<scalar_t> A_data, B_data;
-
-    // todo: work-around to initialize BaseMatrix::num_devices_
-    slate::HermitianMatrix<scalar_t> A0(uplo, n, nb, p, q, MPI_COMM_WORLD);
-
-    slate::HermitianMatrix<scalar_t> A;
-    slate::Matrix<scalar_t> B, X;
-    std::vector<scalar_t> X_data;
-    if (origin != slate::Origin::ScaLAPACK) {
-        if (dev_dist == slate::Dist::Row && target == slate::Target::Devices) {
-            // slate_assert(target == slate::Target::Devices);
-            // todo: doesn't work when lookahead is greater than 2
-            // slate_assert(lookahead < 3);
-
-            auto tileNb = slate::func::uniform_blocksize( n, nb );
-            auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col,
-                                                          p, q );
-            int num_devices = blas::get_device_count();
-            auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col,
-                                                           p, num_devices );
-
-            A = slate::HermitianMatrix<scalar_t>(
-                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
-            B = slate::Matrix<scalar_t>(
-                    n, nrhs, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
-        }
-        else {
-            A = slate::HermitianMatrix<scalar_t>(
-                    uplo, n, nb, p, q, MPI_COMM_WORLD);
-            B = slate::Matrix<scalar_t>(
-                    n, nrhs, nb, p, q, MPI_COMM_WORLD);
-        }
-
-        // SLATE allocates CPU or GPU tiles.
-        slate::Target origin_target = origin2target(origin);
-        A.insertLocalTiles(origin_target);
-
-        B.insertLocalTiles(origin_target);
-
-        if (is_iterative) {
-            X_data.resize(lldB*nlocB);
-            X = slate::Matrix<scalar_t>(n, nrhs, nb, p, q, MPI_COMM_WORLD);
-            X.insertLocalTiles(origin_target);
-        }
-    }
-    else {
-        // Create SLATE matrix from the ScaLAPACK layouts
-        A_data.resize( lldA * nlocA );
-        B_data.resize( lldB * nlocB );
-        A = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
-        B = slate::Matrix<scalar_t>::fromScaLAPACK(
-                n, nrhs, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD);
-        if (is_iterative) {
-            X_data.resize(lldB*nlocB);
-            X = slate::Matrix<scalar_t>::fromScaLAPACK(
-                    n, nrhs, &X_data[0], lldB, nb, p, q, MPI_COMM_WORLD);
-        }
+    auto A_alloc = allocate_test_HermitianMatrix<scalar_t>( check || ref, true, n, params );
+    auto B_alloc = allocate_test_Matrix<scalar_t>( check || ref, true, n, nrhs, params );
+    TestMatrix<slate::Matrix<scalar_t>> X_alloc;
+    if (is_iterative) {
+        X_alloc = allocate_test_Matrix<scalar_t>( false, true, n, nrhs, params );
     }
 
+    auto& A = A_alloc.A;
+    auto& A_data = A_alloc.A_data;
+    auto& Aref = A_alloc.Aref;
+    auto& Aref_data = A_alloc.Aref_data;
+    auto& B = B_alloc.A;
+    auto& B_data = B_alloc.A_data;
+    auto& Bref = B_alloc.Aref;
+    auto& Bref_data = B_alloc.Aref_data;
+    auto& X = X_alloc.A;
+
     slate::generate_matrix(params.matrix, A);
     slate::generate_matrix(params.matrixB, B);
     print_matrix("A", A, params);
     print_matrix("B", B, params);
 
     // if check is required, copy test data and create a descriptor for it
-    std::vector<scalar_t> Aref_data(lldA*nlocA);
-    std::vector<scalar_t> Bref_data(lldB*nlocB);
     std::vector<scalar_t> B_orig;
-    slate::HermitianMatrix<scalar_t> Aref;
-    slate::Matrix<scalar_t> Bref;
     if (check || ref) {
-        // SLATE matrix wrappers for the reference data
-        Aref = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                uplo, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
-        Bref = slate::Matrix<scalar_t>::fromScaLAPACK(
-                n, nrhs, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD);
-
         slate::copy( A, Aref );
         slate::copy( B, Bref );
@@ -414,34 +352,12 @@ void test_posv_work(Params& params, bool run)
     #ifdef SLATE_HAVE_SCALAPACK
         // A comparison with a reference routine from ScaLAPACK for timing only
         // BLACS/MPI variables
-        blas_int ictxt, p_, q_, myrow_, mycol_;
-        blas_int A_desc[9], Aref_desc[9];
-        blas_int B_desc[9], Bref_desc[9];
-        blas_int mpi_rank_ = 0, nprocs = 1;
 
         // initialize BLACS and ScaLAPACK
-        Cblacs_pinfo(&mpi_rank_, &nprocs);
-        slate_assert( mpi_rank == mpi_rank_ );
-        slate_assert(p*q <= nprocs);
-        Cblacs_get(-1, 0, &ictxt);
-        Cblacs_gridinit(&ictxt, "Col", p, q);
-        Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_);
-        slate_assert( p == p_ );
-        slate_assert( q == q_ );
-        slate_assert( myrow == myrow_ );
-        slate_assert( mycol == mycol_ );
-
-        scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info);
-        slate_assert(info == 0);
-
-        scalapack_descinit(B_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info);
-        slate_assert(info == 0);
-
-        scalapack_descinit(Aref_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info);
-        slate_assert(info == 0);
-
-        scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info);
-        slate_assert(info == 0);
+        blas_int ictxt, Aref_desc[9], Bref_desc[9];
+        create_ScaLAPACK_context( grid_order, p, q, &ictxt );
+        A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc );
+        B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc );
 
         if (check) {
             // restore Bref_data
@@ -477,9 +393,9 @@ void test_posv_work(Params& params, bool run)
 
         if (verbose > 2) {
             if (origin == slate::Origin::ScaLAPACK) {
-                slate::Debug::diffLapackMatrices(n, n, &A_data[0], lldA, &Aref_data[0], lldA, nb, nb);
+                slate::Debug::diffLapackMatrices(n, n, &A_data[0], A_alloc.lld, &Aref_data[0], A_alloc.lld, nb, nb);
                 if (params.routine != "potrf") {
-                    slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], lldB, &Bref_data[0], lldB, nb, nb);
+                    slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], B_alloc.lld, &Bref_data[0], B_alloc.lld, nb, nb);
                 }
             }
         }

From ac36b67efbad71e0726a36c6c2c3c2b452e1f927 Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Fri, 15 Dec 2023 10:33:36 -0500
Subject: [PATCH 02/33] Refactor out some duplicate code for TestMatrix class

---
 test/matrix_utils.hh | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 750afddba..ff780cc81 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -214,6 +214,20 @@ class TestMatrix {
     using scalar_t = typename MatrixType::value_type;
 
 public:
+    TestMatrix() {}
+
+    TestMatrix(int64_t m_in, int64_t n_in, int64_t nb_in,
+               int64_t p, int64_t q, slate::GridOrder grid_order)
+        : m(m_in), n(n_in), nb(nb_in)
+    {
+        int mpi_rank, myrow, mycol;
+        MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
+        gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
+        this->mloc = num_local_rows_cols( m_in, nb, myrow, p );
+        this->nloc = num_local_rows_cols( n_in, nb, mycol, q );
+        this->lld  = blas::max( 1, mloc ); // local leading dimension of A
+    }
+
     // SLATE matrices
     MatrixType A;
     MatrixType Aref;
@@ -282,18 +296,7 @@ TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
     slate::GridOrder grid_order = params.grid_order();
 
     // The object to be returned
-    TestMatrix<slate::Matrix<scalar_t>> matrix;
-    matrix.m = m;
-    matrix.n = n;
-
-    // ScaLAPACK variables for reference matrix
-    int mpi_rank, myrow, mycol;
-    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
-    gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
-    matrix.nb = nb;
-    matrix.mloc = num_local_rows_cols( m, nb, myrow, p );
-    matrix.nloc = num_local_rows_cols( n, nb, mycol, q );
-    matrix.lld  = blas::max( 1, matrix.mloc ); // local leading dimension of A
+    TestMatrix<slate::Matrix<scalar_t>> matrix( m, n, nb, p, q, grid_order );
 
     // Functions for nonuniform tile sizes.
     // Odd-numbered tiles are 2*nb, even-numbered tiles are nb.
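The constructor shown above centralizes the ScaLAPACK sizing arithmetic that
each allocate_test_* helper previously repeated. A minimal sketch of what a
caller gets back (the dimensions, grid shape, and scalar type here are
illustrative, not taken from the patch):

    // Sketch: a 1000 x 1000 double matrix on a 2 x 2 column-major grid
    // with 64 x 64 tiles.
    TestMatrix<slate::Matrix<double>> matrix( 1000, 1000, 64,
                                              2, 2, slate::GridOrder::Col );
    // mloc and nloc are this rank's local row and column counts;
    // lld = max( 1, mloc ) is the local leading dimension, so a
    // ScaLAPACK-layout buffer needs lld * nloc elements.
    std::vector<double> local_data( matrix.lld * matrix.nloc );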
@@ -400,18 +403,7 @@ TestMatrix> allocate_test_HermitianMatrix( slate::GridOrder grid_order = params.grid_order(); // The object to be returned - TestMatrix> matrix; - matrix.m = n; - matrix.n = n; - - // ScaLAPACK variables for reference matrix - int mpi_rank, myrow, mycol; - MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - matrix.nb = nb; - matrix.mloc = num_local_rows_cols( n, nb, myrow, p ); - matrix.nloc = num_local_rows_cols( n, nb, mycol, q ); - matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A + TestMatrix> matrix ( n, n, nb, p, q, grid_order ); // Functions for nonuniform tile sizes or row device distribution std::function< int64_t (int64_t j) > tileNb; From 72f871d02d96b464b0821648f135184d8db1cf36 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:05:57 -0500 Subject: [PATCH 03/33] Add support for dev_dist to test_gesv --- test/matrix_utils.hh | 29 +++++++++++---- test/run_tests.py | 84 ++++++++++++++++++++++---------------------- test/test_gesv.cc | 6 ++++ test/test_posv.cc | 20 +++++++---- 4 files changed, 84 insertions(+), 55 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index ff780cc81..9ee1b9c03 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -254,6 +254,7 @@ inline void mark_params_for_test_Matrix(Params& params) { params.grid.m(); params.grid.n(); + params.dev_dist(); params.nb(); params.nonuniform_nb(); params.origin(); @@ -290,6 +291,7 @@ TestMatrix> allocate_test_Matrix( // Load params variables int64_t p = params.grid.m(); int64_t q = params.grid.n(); + slate::Dist dev_dist = params.dev_dist(); int64_t nb = params.nb(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; slate::Origin origin = params.origin(); @@ -300,13 +302,21 @@ TestMatrix> allocate_test_Matrix( // Functions for nonuniform tile sizes. // Odd-numbered tiles are 2*nb, even-numbered tiles are nb. - std::function< int64_t (int64_t j) > - tileNb = [nb](int64_t j) { - return (j % 2 != 0 ? nb*2 : nb); - }; + std::function< int64_t (int64_t j) > tileMb, tileNb; + if (nonuniform_nb) { + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? 
nb*2 : nb); + }; + tileMb = tileNb; + } + else { + tileMb = slate::func::uniform_blocksize( m, nb ); + tileNb = slate::func::uniform_blocksize( n, nb ); + } auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), p, num_devices_ ); // Setup matrix to test SLATE with @@ -316,7 +326,9 @@ TestMatrix> allocate_test_Matrix( if (nonuniform_nb) { params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) ) + ", " + std::to_string( tileNb( 1 ) ); - matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, + } + if (nonuniform_nb || dev_dist == slate::Dist::Col) { + matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); } else { @@ -327,6 +339,7 @@ TestMatrix> allocate_test_Matrix( } else { assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); // Create SLATE matrix from the ScaLAPACK layouts matrix.A_data.resize( matrix.lld * matrix.nloc ); matrix.A = slate::Matrix::fromScaLAPACK( @@ -337,6 +350,7 @@ TestMatrix> allocate_test_Matrix( // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { + std::cout << "Using nonuniform Bref" << std::endl; matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); matrix.Aref.insertLocalTiles( slate::Target::Host ); @@ -423,7 +437,7 @@ TestMatrix> allocate_test_HermitianMatrix( // Setup matrix to test SLATE with if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Row) { + if (nonuniform_nb || dev_dist == slate::Dist::Col) { matrix.A = slate::HermitianMatrix( uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); } @@ -438,6 +452,7 @@ TestMatrix> allocate_test_HermitianMatrix( } else { assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); // Create SLATE matrix from the ScaLAPACK layouts matrix.A_data.resize( matrix.lld * matrix.nloc ); matrix.A = slate::HermitianMatrix::fromScaLAPACK( diff --git a/test/run_tests.py b/test/run_tests.py index 4561e217f..7bc730139 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -348,9 +348,9 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], # todo: hemmA GPU support @@ -386,29 +386,29 @@ def filter_csv( values, csv ): # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb ], - [ 'gesv_tntpiv', gen + dtype + la + n ], - [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], + [ 'gesv_tntpiv', gen + dtype + la + n + ddist ], + [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb 
], - [ 'getrf_tntpiv', gen + dtype + la + n ], - [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], + [ 'getrf_tntpiv', gen + dtype + la + n + ddist ], + [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb ], - [ 'getrs_tntpiv', gen + dtype + la + n + trans ], - [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb + ddist ], + [ 'getrs_tntpiv', gen + dtype + la + n + trans + ddist ], + [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + ddist + ' --matrix rand_dominant' ], - [ 'getri', gen + dtype + la + n ], - [ 'getriOOP', gen + dtype + la + n ], + [ 'getri', gen + dtype + la + n + ddist ], + [ 'getriOOP', gen + dtype + la + n + ddist ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb ], - [ 'gesv_rbt', gen + dtype + la + n ], + [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb + ddist ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb + ddist ], + [ 'gesv_rbt', gen + dtype + la + n + ddist ], ] # LU banded @@ -628,31 +628,31 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab + nonuniform_nb ], - [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + uplo ], - [ 'tradd', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'syadd', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'headd', gen + dtype + n + ab + nonuniform_nb + uplo ], - - [ 'copy', gen + dtype + mn + nonuniform_nb ], - [ 'tzcopy', gen + dtype + mn + nonuniform_nb + uplo ], - [ 'trcopy', gen + dtype + n + nonuniform_nb + uplo ], - [ 'sycopy', gen + dtype + n + nonuniform_nb + uplo ], - [ 'hecopy', gen + dtype + n + nonuniform_nb + uplo ], - - [ 'scale', gen + dtype + mn + ab + nonuniform_nb ], - [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + uplo ], - [ 'trscale', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'syscale', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'hescale', gen + dtype + n + ab + nonuniform_nb + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb ], - - [ 'set', gen + dtype + mn + ab + nonuniform_nb ], - [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + uplo ], - [ 'trset', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'syset', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'heset', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + ddist + uplo ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + + [ 'copy', gen + dtype + mn + nonuniform_nb + ddist ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], + + [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], + [ 'trscale', gen + dtype + 
n + ab + nonuniform_nb + ddist + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist ], + + [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], ] # ------------------------------------------------------------------------------ diff --git a/test/test_gesv.cc b/test/test_gesv.cc index a19beadb8..db7a1a49d 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -75,6 +75,7 @@ void test_gesv_work(Params& params, bool run) slate::Origin origin = params.origin(); slate::Target target = params.target(); slate::GridOrder grid_order = params.grid_order(); + slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); @@ -156,6 +157,11 @@ void test_gesv_work(Params& params, bool run) if (! run) return; + if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { + params.msg() = "skipping: dev_dist = Row applies only to target devices"; + return; + } + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; return; diff --git a/test/test_posv.cc b/test/test_posv.cc index 4803fa137..1532f9193 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -42,6 +42,7 @@ void test_posv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; bool hold_local_workspace = params.hold_local_workspace() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); @@ -136,13 +137,18 @@ void test_posv_work(Params& params, bool run) {slate::Option::UseFallbackSolver, fallback}, }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Row applies only to target devices"; + params.msg() = "skipping: dev_dist = Col applies only to target devices"; + return; + } + + if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; + return; + } + + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; return; } @@ -403,6 +409,8 @@ void test_posv_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); #endif From 9dfcbd09ef45d22b2c887c691da00727618fe931 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:22:10 -0500 Subject: [PATCH 04/33] Refactor out some common error checks in the tester --- test/test_add.cc | 6 ++++++ test/test_copy.cc | 6 ++++++ test/test_gemm.cc | 7 +++++++ test/test_gesv.cc | 15 +++++---------- test/test_posv.cc | 32 +++++++++++++------------------- test/test_scale.cc | 6 ++++++ 
test/test_scale_row_col.cc | 6 ++++++ test/test_set.cc | 6 ++++++ test/test_utils.hh | 30 ++++++++++++++++++++++++++++++ 9 files changed, 85 insertions(+), 29 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index 3e51600b3..d5224e5a9 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -67,6 +68,11 @@ void test_add_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_copy.cc b/test/test_copy.cc index 84721bb9d..ffeb68e9e 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -65,6 +66,11 @@ void test_copy_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 746e2f74d..0e2a56ac9 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -7,8 +7,10 @@ #include "test.hh" #include "blas/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -86,6 +88,11 @@ void test_gemm_work(Params& params, bool run) } #endif + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, diff --git a/test/test_gesv.cc b/test/test_gesv.cc index db7a1a49d..737dca144 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -8,8 +8,10 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -72,10 +74,8 @@ void test_gesv_work(Params& params, bool run) int verbose = params.verbose(); int timer_level = params.timer_level(); SLATE_UNUSED(verbose); - slate::Origin origin = params.origin(); slate::Target target = params.target(); slate::GridOrder grid_order = params.grid_order(); - slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); @@ -154,18 +154,13 @@ void test_gesv_work(Params& params, bool run) depth = params.depth(); } - if (! run) - return; - - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Row applies only to target devices"; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { return; } - if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + if (! run) return; - } if ((params.routine == "gesv_mixed" || params.routine == "gesv_mixed_gmres") && ! 
std::is_same::value) { diff --git a/test/test_posv.cc b/test/test_posv.cc index 1532f9193..f50a04fe2 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -8,12 +8,12 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" -#include "scalapack_copy.hh" #include "auxiliary/Debug.hh" #include @@ -49,12 +49,13 @@ void test_posv_work(Params& params, bool run) slate::Origin origin = params.origin(); slate::Target target = params.target(); slate::GridOrder grid_order = params.grid_order(); - slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); slate::Method methodTrsm = params.method_trsm(); slate::Method methodHemm = params.method_hemm(); + mark_params_for_test_HermitianMatrix( params ); + // Currently only posv* supports timer_level >= 2. std::vector timer_lvl_support{ "posv", "posv_mixed", "posv_mixed_gmres" }; @@ -127,6 +128,11 @@ void test_posv_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -137,21 +143,6 @@ void test_posv_work(Params& params, bool run) {slate::Option::UseFallbackSolver, fallback}, }; - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Col applies only to target devices"; - return; - } - - if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; - return; - } - - if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; - return; - } - if ((params.routine == "posv_mixed" || params.routine == "posv_mixed_gmres") && ! std::is_same::value) { params.msg() = "skipping: unsupported mixed precision; must be type=d or z"; @@ -357,7 +348,10 @@ void test_posv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - // BLACS/MPI variables + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; diff --git a/test/test_scale.cc b/test/test_scale.cc index a98ffcb2d..b5c99c324 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -67,6 +68,11 @@ void test_scale_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 7c1ed4ffd..ea9ce0da6 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -68,6 +69,11 @@ void test_scale_row_col_work( Params& params, bool run ) if (! 
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_set.cc b/test/test_set.cc index 2dc088f0a..78c0a82e1 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -67,6 +68,11 @@ void test_set_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_utils.hh b/test/test_utils.hh index a946411b0..67a3d92ee 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -8,6 +8,36 @@ #include "slate/slate.hh" +///----------------------------------------------------------------------------- +/// Checks for common invalid parameter combinations +/// +/// @return true if the configuration should be skipped +/// +inline bool is_invalid_parameters(Params& params) +{ + slate::Origin origin = params.origin(); + slate::Target target = params.target(); + slate::Dist dev_dist = params.dev_dist(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + + if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { + params.msg() = "skipping: dev_dist = Col applies only to target devices"; + return true; + } + + if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; + return true; + } + + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + return true; + } + + return false; +} + ///----------------------------------------------------------------------------- /// Applies the operator thunk to each element of A and B to update B. /// The matrices must have the same size, but can have different tile sizes and From 8460bb8ef9d172587c9bec07f43d03b6381216ec Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:47:50 -0500 Subject: [PATCH 05/33] Allow TestMatrix to create ScaLAPACK contexts to reduce duplication --- test/matrix_utils.hh | 30 ++++++++++++++++++++---------- test/test_add.cc | 4 +--- test/test_copy.cc | 4 +--- test/test_gemm.cc | 5 +---- test/test_gesv.cc | 5 +---- test/test_posv.cc | 5 +---- test/test_scale.cc | 4 +--- test/test_scale_row_col.cc | 5 +++-- test/test_set.cc | 4 +--- 9 files changed, 30 insertions(+), 36 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 9ee1b9c03..18a007f14 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -9,6 +9,7 @@ #include "slate/slate.hh" #include "scalapack_wrappers.hh" +#include "grid_utils.hh" //------------------------------------------------------------------------------ // Zero out B, then copy band matrix B from A. 
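Patch 04 centralizes the skip logic and patch 05 (below) moves BLACS context
creation into TestMatrix; a condensed sketch of the boilerplate each tester
ends up with, assuming an allocation produced by allocate_test_Matrix:

    // Early-out on parameter combinations the tests do not support.
    if (is_invalid_parameters( params ))
        return;

    auto A_alloc = allocate_test_Matrix<double>( check, true, m, n, params );
    #ifdef SLATE_HAVE_SCALAPACK
        blas_int ictxt, A_desc[9];
        A_alloc.create_ScaLAPACK_context( &ictxt );     // wraps grid_order, p, q
        A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );  // descinit from mloc, nb
    #endif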
@@ -216,15 +217,15 @@ class TestMatrix { public: TestMatrix() {} - TestMatrix(int64_t m_in, int64_t n_in, int64_t nb_in, - int64_t p, int64_t q, slate::GridOrder grid_order) - : m(m_in), n(n_in), nb(nb_in) + TestMatrix(int64_t m_, int64_t n_, int64_t nb_, + int p_, int q_, slate::GridOrder grid_order_) + : m(m_), n(n_), nb(nb_), p(p_), q(q_), grid_order(grid_order_) { int mpi_rank, myrow, mycol; MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - this->mloc = num_local_rows_cols( m_in, nb, myrow, p ); - this->nloc = num_local_rows_cols( n_in, nb, mycol, q ); + this->mloc = num_local_rows_cols( m, nb, myrow, p ); + this->nloc = num_local_rows_cols( n, nb, mycol, q ); this->lld = blas::max( 1, mloc ); // local leading dimension of A } @@ -238,13 +239,22 @@ public: // ScaLAPACK configuration int64_t m, n, mloc, nloc, lld, nb; + int p, q; + slate::GridOrder grid_order; #ifdef SLATE_HAVE_SCALAPACK - void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) { + void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) + { int64_t info; scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mloc, &info); slate_assert(info == 0); } + + void create_ScaLAPACK_context( blas_int* ictxt ) + { + // Call free function version + ::create_ScaLAPACK_context( grid_order, p, q, ictxt ); + } #endif }; @@ -289,8 +299,8 @@ TestMatrix> allocate_test_Matrix( Params& params) { // Load params variables - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); + int p = params.grid.m(); + int q = params.grid.n(); slate::Dist dev_dist = params.dev_dist(); int64_t nb = params.nb(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; @@ -408,8 +418,8 @@ TestMatrix> allocate_test_HermitianMatrix( { // Load params variables slate::Uplo uplo = params.uplo(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); + int p = params.grid.m(); + int q = params.grid.n(); slate::Dist dev_dist = params.dev_dist(); int64_t nb = params.nb(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; diff --git a/test/test_add.cc b/test/test_add.cc index d5224e5a9..b0fa9581d 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -50,8 +50,6 @@ void test_add_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; @@ -139,7 +137,7 @@ void test_add_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_copy.cc b/test/test_copy.cc index ffeb68e9e..f6f6b6d7d 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -48,8 +48,6 @@ void test_copy_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! 
ref_only; @@ -141,7 +139,7 @@ void test_copy_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 0e2a56ac9..8d77b7e89 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -46,8 +46,6 @@ void test_gemm_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); int64_t k = params.dim.k(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t lookahead = params.lookahead(); bool ref_only = params.ref() == 'o'; slate::Norm norm = params.norm(); @@ -58,7 +56,6 @@ void test_gemm_work(Params& params, bool run) int verbose = params.verbose(); slate::Target target = params.target(); slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); slate::Method method_gemm = params.method_gemm(); params.matrix.mark(); params.matrixB.mark(); @@ -238,7 +235,7 @@ void test_gemm_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 737dca144..b1ccec84c 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -61,8 +61,6 @@ void test_gesv_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t ib = params.ib(); int64_t lookahead = params.lookahead(); int64_t panel_threads = params.panel_threads(); @@ -75,7 +73,6 @@ void test_gesv_work(Params& params, bool run) int timer_level = params.timer_level(); SLATE_UNUSED(verbose); slate::Target target = params.target(); - slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); params.matrixB.mark(); @@ -393,7 +390,7 @@ void test_gesv_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); diff --git a/test/test_posv.cc b/test/test_posv.cc index f50a04fe2..423257b2c 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -34,8 +34,6 @@ void test_posv_work(Params& params, bool run) slate::Uplo uplo = params.uplo(); int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); bool ref_only = params.ref() == 'o'; @@ -48,7 +46,6 @@ void test_posv_work(Params& params, bool run) int timer_level = params.timer_level(); slate::Origin origin = params.origin(); slate::Target target = params.target(); - slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); params.matrixB.mark(); slate::Method methodTrsm = params.method_trsm(); @@ -355,7 +352,7 @@ void test_posv_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + 
A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); diff --git a/test/test_scale.cc b/test/test_scale.cc index b5c99c324..95e01f16a 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -50,8 +50,6 @@ void test_scale_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; @@ -130,7 +128,7 @@ void test_scale_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index ea9ce0da6..78fcc64ab 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -58,6 +58,7 @@ void test_scale_row_col_work( Params& params, bool run ) bool trace = params.trace() == 'y'; int verbose = params.verbose(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); mark_params_for_test_Matrix( params ); @@ -144,7 +145,7 @@ void test_scale_row_col_work( Params& params, bool run ) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_max = slate::norm( slate::Norm::Max, A ); @@ -152,7 +153,7 @@ void test_scale_row_col_work( Params& params, bool run ) std::vector Rlocal( A_alloc.mloc ), Clocal( A_alloc.nloc ); int myrow, mycol; - gridinfo( A.mpiRank(), slate::GridOrder::Col, p, q, &myrow, &mycol ); + gridinfo( A.mpiRank(), grid_order, p, q, &myrow, &mycol ); // Copy local part of R. int64_t ii = 0, iilocal = 0; diff --git a/test/test_set.cc b/test/test_set.cc index 78c0a82e1..b32af8660 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -50,8 +50,6 @@ void test_set_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! 
ref_only; @@ -130,7 +128,7 @@ void test_set_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); From aa06be3c75a7c2214ae9a8e4e803a0f2b50770a5 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:48:13 -0500 Subject: [PATCH 06/33] Improve thread safety of matrix_iterator --- test/test_utils.hh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.hh b/test/test_utils.hh index 67a3d92ee..02211a394 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -69,9 +69,10 @@ void matrix_iterator( #pragma omp task shared(A, B) \ firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) { + int tag = A_i + A_j * A.mt(); if (B.tileIsLocal( B_i, B_j )) { A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), - slate::Layout::ColMajor ); + slate::Layout::ColMajor, tag ); A.tileGetForReading( A_i, A_j, ColMajor ); B.tileGetForWriting( B_i, B_j, ColMajor ); @@ -93,7 +94,7 @@ void matrix_iterator( } } else if (A.tileIsLocal( A_i, A_j )) { - A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ) ); + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); } } From 2681a861c099dfdfc38efe036d89c465e40db7d4 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:55:04 -0500 Subject: [PATCH 07/33] Change default device distribution value to match behavior --- test/test.cc | 2 +- test/test_utils.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test.cc b/test/test.cc index 078df162a..16ba41ecf 100644 --- a/test/test.cc +++ b/test/test.cc @@ -357,7 +357,7 @@ Params::Params(): method_trsm ("trsm", 4, ParamType::List, 0, str2methodTrsm, methodTrsm2str, "auto=auto, A=trsmA, B=trsmB"), grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"), - dev_dist ("dev-dist",9, ParamType::List, slate::Dist::Col, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"), + dev_dist ("dev-dist",9, ParamType::List, slate::Dist::Row, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"), // name, w, type, default, char2enum, enum2char, enum2str, help layout ("layout", 6, ParamType::List, slate::Layout::ColMajor, blas::char2layout, blas::layout2char, blas::layout2str, "layout: r=row major, c=column major"), diff --git a/test/test_utils.hh b/test/test_utils.hh index 02211a394..acedfedb5 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -20,7 +20,7 @@ inline bool is_invalid_parameters(Params& params) slate::Dist dev_dist = params.dev_dist(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { + if (target != slate::Target::Devices && dev_dist == slate::Dist::Col) { params.msg() = "skipping: dev_dist = Col applies only to target devices"; return true; } From 8758c28ae20c056d8c0f19fa983fdff11d7d2557 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 11:03:27 -0500 Subject: [PATCH 08/33] Add grid order to test/run_tests.py --- test/run_tests.py | 98 ++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index 7bc730139..06c375395 
100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -127,6 +127,7 @@ group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' ) group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' ) group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' ) +group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='r' ) group_opt.add_argument( '--repeat', action='store', help='times to repeat each test', default='' ) group_opt.add_argument( '--thresh', action='store', help='default=%(default)s', default='1,0.5' ) group_opt.add_argument( '--matrix', action='store', help='default=%(default)s', default='' ) @@ -302,6 +303,7 @@ nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else '' nt = ' --nt ' + opts.nt if (opts.nt) else '' grid = ' --grid ' + opts.grid if (opts.grid) else '' +grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else '' repeat = ' --repeat ' + opts.repeat if (opts.repeat) else '' thresh = ' --thresh ' + opts.thresh if (opts.thresh) else '' matrix = ' --matrix ' + opts.matrix if (opts.matrix) else '' @@ -348,9 +350,9 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], # todo: hemmA GPU support @@ -386,29 +388,29 @@ def filter_csv( values, csv ): # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], - [ 'gesv_tntpiv', gen + dtype + la + n + ddist ], - [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + [ 'gesv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], + [ 'gesv_tntpiv', gen + dtype + la + n + ddist + grid_order ], + [ 'gesv_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], - [ 'getrf_tntpiv', gen + dtype + la + n + ddist ], - [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + [ 'getrf', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], + [ 'getrf_tntpiv', gen + dtype + la + n + ddist + grid_order ], + [ 'getrf_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb + ddist ], - [ 'getrs_tntpiv', gen + dtype + la + n + trans + ddist ], - [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + ddist + [ 'getrs', gen + dtype + la + n + trans + ddist + grid_order + nonuniform_nb + thresh ], + [ 'getrs_tntpiv', gen + dtype + la + n + ddist + grid_order ], + [ 'getrs_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getri', gen 
+ dtype + la + n + ddist ], - [ 'getriOOP', gen + dtype + la + n + ddist ], + [ 'getri', gen + dtype + la + n + ddist + grid_order ], + [ 'getriOOP', gen + dtype + la + n + ddist + grid_order ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb + ddist ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb + ddist ], - [ 'gesv_rbt', gen + dtype + la + n + ddist ], + [ 'gesv_mixed', gen + dtype_double + la + n + ddist + grid_order + nonuniform_nb ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ddist + grid_order + nonuniform_nb ], + [ 'gesv_rbt', gen + dtype + la + n + ddist + grid_order ], ] # LU banded @@ -424,14 +426,14 @@ def filter_csv( values, csv ): # Cholesky if (opts.chol): cmds += [ - [ 'posv', gen + dtype + la + n + uplo ], - [ 'potrf', gen + dtype + la + n + uplo + ddist ], - [ 'potrs', gen + dtype + la + n + uplo ], - [ 'potri', gen + dtype + la + n + uplo ], + [ 'posv', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'potrf', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'potrs', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'potri', gen + dtype + la + n + uplo + ddist + grid_order ], #[ 'porfs', gen + dtype + la + n + uplo ], #[ 'poequ', gen + dtype + la + n ], # only diagonal elements (no uplo) - [ 'posv_mixed', gen + dtype_double + la + n + uplo ], - [ 'posv_mixed_gmres', gen + dtype_double + la + n + uplo + ' --nrhs 1' ], + [ 'posv_mixed', gen + dtype_double + la + n + uplo + ddist + grid_order ], + [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + uplo + ddist + grid_order ], [ 'trtri', gen + dtype + la + n + uplo + diag ], ] @@ -628,31 +630,31 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist ], - [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], - [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + ddist + uplo ], - [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - - [ 'copy', gen + dtype + mn + nonuniform_nb + ddist ], - [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + uplo ], - [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], - [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], - [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], - - [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist ], - [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], - [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist ], - - [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist ], - [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], - [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + 
grid_order + uplo ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + + [ 'copy', gen + dtype + mn + nonuniform_nb + ddist + grid_order ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + grid_order + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], + + [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist + grid_order ], + + [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], ] # ------------------------------------------------------------------------------ From fa4a5c361570d976a44655b417dd2ba23ceb6093 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 15:31:02 -0500 Subject: [PATCH 09/33] Add options to hemm and symm, plus fix some other minor issues --- test/matrix_utils.hh | 110 +++++++++++++++++------ test/run_tests.py | 8 +- test/test.hh | 8 +- test/test_gemm.cc | 10 +-- test/test_hemm.cc | 190 ++++++++++++++++----------------------- test/test_posv.cc | 1 + test/test_symm.cc | 208 ++++++++++++++++++------------------------- 7 files changed, 259 insertions(+), 276 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 18a007f14..59f9f3aa4 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -360,7 +360,6 @@ TestMatrix> allocate_test_Matrix( // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { - std::cout << "Using nonuniform Bref" << std::endl; matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); matrix.Aref.insertLocalTiles( slate::Target::Host ); @@ -391,26 +390,18 @@ inline void mark_params_for_test_HermitianMatrix(Params& params) } // ----------------------------------------------------------------------------- -/// Allocates a HermitianMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. 
-///
-/// @param m[in]
-///     The number of rows
-///
-/// @param n[in]
-///     The number of columns
-///
-/// @param params[in]
-///     The test params object which contains many of the key parameters
+/// Marks the parameters used by allocate_test_SymmetricMatrix
+inline void mark_params_for_test_SymmetricMatrix(Params& params)
+{
+    mark_params_for_test_HermitianMatrix( params );
+}
+
+
+// -----------------------------------------------------------------------------
+/// Helper routine to avoid duplicating logic between HermitianMatrix and SymmetricMatrix
 ///
-template <typename scalar_t>
-TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+template <typename matrixtype>
+TestMatrix<matrixtype> allocate_test_HeSyMatrix(
     bool ref_matrix,
     bool nonuniform_ref,
     int64_t n,
     Params& params)
@@ -427,7 +418,7 @@ TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
     slate::GridOrder grid_order = params.grid_order();
 
     // The object to be returned
-    TestMatrix<slate::HermitianMatrix<scalar_t>> matrix ( n, n, nb, p, q, grid_order );
+    TestMatrix<matrixtype> matrix ( n, n, nb, p, q, grid_order );
 
     // Functions for nonuniform tile sizes or row device distribution
     std::function< int64_t (int64_t j) > tileNb;
@@ -448,12 +439,12 @@ TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
     // Setup matrix to test SLATE with
     if (origin != slate::Origin::ScaLAPACK) {
         if (nonuniform_nb || dev_dist == slate::Dist::Col) {
-            matrix.A = slate::HermitianMatrix<scalar_t>(
+            matrix.A = matrixtype(
                     uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
         }
         else {
-            matrix.A = slate::HermitianMatrix<scalar_t>(
-                    uplo, n, nb, p, q, MPI_COMM_WORLD);
+            matrix.A = matrixtype(
+                    uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD);
         }
 
         // SLATE allocates CPU or GPU tiles.
@@ -465,26 +456,87 @@ TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
         assert( dev_dist == slate::Dist::Row );
         // Create SLATE matrix from the ScaLAPACK layouts
         matrix.A_data.resize( matrix.lld * matrix.nloc );
-        matrix.A = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                uplo, n, &matrix.A_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+        matrix.A = matrixtype::fromScaLAPACK(
+                uplo, n, &matrix.A_data[0], matrix.lld, nb,
+                grid_order, p, q, MPI_COMM_WORLD);
     }
 
     // Setup reference matrix
     if (ref_matrix) {
         if (nonuniform_nb && nonuniform_ref) {
-            matrix.A = slate::HermitianMatrix<scalar_t>(
+            matrix.Aref = matrixtype(
                     uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
             matrix.Aref.insertLocalTiles( slate::Target::Host );
         }
         else {
             matrix.Aref_data.resize( matrix.lld * matrix.nloc );
-            matrix.Aref = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+            matrix.Aref = matrixtype::fromScaLAPACK(
+                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD);
         }
     }
 
     return matrix;
 }
 
+// -----------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and a reference version for testing.
+///
+/// @param ref_matrix[in]
+///     Whether to allocate a reference matrix
+///
+/// @param nonuniform_ref[in]
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_HermitianMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params) +{ + return allocate_test_HeSyMatrix>( + ref_matrix, nonuniform_ref, n, params ); +} + +// ----------------------------------------------------------------------------- +/// Allocates a SymmetricMatrix and a reference version for testing. +/// +/// @param ref_matrix[in] +/// Whether to allocate a reference matrix +/// +/// @param nonuniform_ref[in] +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_SymmetricMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params) +{ + return allocate_test_HeSyMatrix>( + ref_matrix, nonuniform_ref, n, params ); +} #endif // SLATE_MATRIX_UTILS_HH diff --git a/test/run_tests.py b/test/run_tests.py index 06c375395..b691e3523 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -354,10 +354,10 @@ def filter_csv( values, csv ): [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], - [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], + [ 'hemm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], # todo: hemmA GPU support - [ 'hemmA', gen_no_target + dtype + la + side + uplo + mn + ab + matrixBC ], - [ 'hemmC', gen + dtype + la + side + uplo + mn + ab + matrixBC], + [ 'hemmA', gen_no_target + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], + [ 'hemmC', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC], [ 'hbmm', gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ], @@ -367,7 +367,7 @@ def filter_csv( values, csv ): [ 'her2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixBC ], [ 'her2k', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixBC ], - [ 'symm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], + [ 'symm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], [ 'syr2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixC ], [ 'syr2k', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixC ], diff --git a/test/test.hh b/test/test.hh index 0c62fb9f0..57c428443 100644 --- a/test/test.hh +++ b/test/test.hh @@ -21,10 +21,10 @@ // ----------------------------------------------------------------------------- namespace slate { -enum class Origin { - Host, - ScaLAPACK, - Devices, +enum class Origin : char { + Host = 'H', + ScaLAPACK = 'S', + Devices = 'D', }; enum class Dist : char { diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 8d77b7e89..ca1356fb4 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -76,6 +76,11 @@ void test_gemm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + #ifndef SLATE_HAVE_SCALAPACK // Can run ref only when we have ScaLAPACK. 
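Stepping back to the test.hh change above: backing Origin (and Dist) with its option character lets the enum round-trip through the flag directly. A minimal sketch, assuming test.hh is included; the demo function name is hypothetical:

    #include <cassert>
    #include "test.hh"

    // Hypothetical demo: char-backed enums convert to and from their
    // option characters with plain casts, so no lookup tables are needed.
    void origin_roundtrip_demo()
    {
        slate::Origin origin = slate::Origin( 'D' );  // parse option flag
        char flag = char( slate::Origin::Devices );   // format back: 'D'
        assert( origin == slate::Origin::Devices );
        assert( flag == 'D' );
    }
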
if (ref) { @@ -85,11 +90,6 @@ void test_gemm_work(Params& params, bool run) } #endif - // Check for common invalid combinations - if (is_invalid_parameters( params )) { - return; - } - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, diff --git a/test/test_hemm.cc b/test/test_hemm.cc index 946a60798..45d5f50a2 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -8,10 +8,12 @@ #include "blas/flops.hh" #include "print_matrix.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -45,12 +47,12 @@ void test_hemm_work(Params& params, bool run) int p = params.grid.m(); int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); slate::Origin origin = params.origin(); slate::Target target = params.target(); @@ -59,6 +61,9 @@ void test_hemm_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +77,20 @@ void test_hemm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + + #ifndef SLATE_HAVE_SCALAPACK + // Can only run ref when we have ScaLAPACK + if (ref) { + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + ref = false; + } + #endif + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -85,7 +104,6 @@ void test_hemm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; @@ -96,82 +114,45 @@ void test_hemm_work(Params& params, bool run) MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); gridinfo(mpi_rank, p, q, &myrow, &mycol); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::HermitianMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
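The fixed-nb constructors being deleted below give way to the function-based constructors inside the allocation helpers when nonuniform_nb is set. A sketch under the tests' odd-indices-get-double-tiles convention; the helper name make_nonuniform_hermitian is hypothetical, the slate::func calls are those used in matrix_utils.hh:

    #include <functional>
    #include "slate/slate.hh"

    // Hypothetical helper: build a HermitianMatrix whose tile sizes come
    // from a callback instead of a uniform nb.
    template <typename scalar_t>
    slate::HermitianMatrix<scalar_t> make_nonuniform_hermitian(
        slate::Uplo uplo, int64_t n, int64_t nb,
        slate::GridOrder grid_order, int p, int q, int num_devices )
    {
        // Odd block indices get double-width tiles (the tests' convention).
        std::function< int64_t (int64_t) > tileNb =
            [nb]( int64_t j ) { return j % 2 != 0 ? 2*nb : nb; };
        auto tileRank   = slate::func::process_2d_grid( grid_order, p, q );
        auto tileDevice = slate::func::device_1d_grid(
                              slate::GridOrder::Col, p, num_devices );
        return slate::HermitianMatrix<scalar_t>(
            uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD );
    }
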
- A = slate::HermitianMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_HermitianMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - std::vector Cref_data; - slate::Matrix Cref; - if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - m, n, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy(C, Cref); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). @@ -237,6 +218,9 @@ void test_hemm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
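Backing up a step: the Y consumed by the check below is built in the first half above, before hemm overwrites C. A sketch of the Left case using the classic API (the test's exact calls may differ; the helper name is hypothetical):

    #include "slate/slate.hh"

    // Hypothetical sketch of the Left-side first half:
    //     Y = alpha A (B X) + beta (C X),
    // so that C2*X from the routine under test can be compared against Y.
    template <typename scalar_t>
    void residual_first_half(
        scalar_t alpha, slate::HermitianMatrix<scalar_t>& A,
        slate::Matrix<scalar_t>& B,
        scalar_t beta, slate::Matrix<scalar_t>& C,
        slate::Matrix<scalar_t>& X, slate::Matrix<scalar_t>& Y,
        slate::Matrix<scalar_t>& Z, slate::Options const& opts )
    {
        scalar_t zero = 0, one = 1;
        slate::gemm( beta, C, X, zero, Y, opts );  // Y = beta (C X)
        slate::gemm( one,  B, X, zero, Z, opts );  // Z = B X
        slate::hemm( slate::Side::Left,
                     alpha, A, Z, one, Y, opts );  // Y += alpha A Z
    }
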
real_t y_norm = slate::norm( norm, Y, opts ); @@ -254,57 +238,36 @@ void test_hemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocA, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocA + mlocA + ldw); - std::vector worklange(std::max({mlocC, nlocC, mlocB, nlocB})); - - // get norms of the original data - real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An, - &A_data[0], 1, 1, A_desc, &worklansy[0]); - real_t B_norm = scalapack_plange( - norm2str(norm), Bm, Bn, &B_data[0],1, 1, B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plange( - norm2str(norm), Cm, Cn, &Cref_data[0], 1, 1, Cref_desc, &worklange[0]); - //================================================== // Run ScaLAPACK reference routine. 
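The create_ScaLAPACK_context / ScaLAPACK_descriptor calls used below absorb the Cblacs_* and scalapack_descinit boilerplate the old code repeated per matrix. A minimal sketch of the pattern; the wrapper function name is hypothetical, the member names are those of TestMatrix in matrix_utils.hh:

    #include "matrix_utils.hh"
    #include "scalapack_wrappers.hh"

    // Hypothetical helper: set up a BLACS context and descriptor for one
    // TestMatrix allocation, then mirror its tiles into ScaLAPACK layout.
    template <typename scalar_t>
    void to_scalapack( TestMatrix<slate::Matrix<scalar_t>>& A_alloc,
                       blas_int* ictxt, blas_int A_desc[9] )
    {
        A_alloc.create_ScaLAPACK_context( ictxt );      // BLACS grid, p x q
        A_alloc.ScaLAPACK_descriptor( *ictxt, A_desc ); // descinit from nb, lld
        A_alloc.A_data.resize( A_alloc.lld * A_alloc.nloc );
        copy( A_alloc.A, &A_alloc.A_data[0], A_desc );  // tiles -> 2D block cyclic
    }
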
//================================================== @@ -315,12 +278,13 @@ void test_hemm_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); + + print_matrix( "Diff", C, params ); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm diff --git a/test/test_posv.cc b/test/test_posv.cc index 423257b2c..6193a07ef 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -52,6 +52,7 @@ void test_posv_work(Params& params, bool run) slate::Method methodHemm = params.method_hemm(); mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); // Currently only posv* supports timer_level >= 2. std::vector timer_lvl_support{ "posv", "posv_mixed", diff --git a/test/test_symm.cc b/test/test_symm.cc index c80dd3d98..6a8dc515d 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -6,11 +6,14 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,18 +41,21 @@ void test_symm_work(Params& params, bool run) int p = params.grid.m(); int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -63,6 +69,20 @@ void test_symm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + + #ifndef SLATE_HAVE_SCALAPACK + // Can only run ref when we have ScaLAPACK + if (ref) { + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + ref = false; + } + #endif + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,7 +93,6 @@ void test_symm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; @@ -84,91 +103,45 @@ void test_symm_work(Params& params, bool run) MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); gridinfo(mpi_rank, p, q, &myrow, &mycol); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. 
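The local-size computations being removed on either side of this point are the standard 2D block-cyclic arithmetic that TestMatrix now performs internally. For the record, a sketch using the same num_local_rows_cols helper as the removed code:

    // Sketch: local extents of an m x n matrix on process (myrow, mycol)
    // of a p x q grid with tile size nb.
    int64_t mloc = num_local_rows_cols( m, nb, myrow, p );  // local row count
    int64_t nloc = num_local_rows_cols( n, nb, mycol, q );  // local col count
    int64_t lld  = blas::max( 1, mloc );  // local leading dimension, >= 1
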
- int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::SymmetricMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::SymmetricMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_SymmetricMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data and create a descriptor for it. - slate::Matrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - Cm, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif - - if (side == slate::Side::Left) - slate_assert(A.mt() == C.mt()); - else - slate_assert(A.mt() == C.nt()); - slate_assert(B.mt() == C.mt()); - slate_assert(B.nt() == C.nt()); + if (ref) { + slate::copy( C, Cref ); + } - if (trace) slate::trace::Trace::on(); - else slate::trace::Trace::off(); + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! 
ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). @@ -192,6 +165,16 @@ void test_symm_work(Params& params, bool run) throw slate::Exception("unknown side"); } + if (side == slate::Side::Left) + slate_assert(A.mt() == C.mt()); + else + slate_assert(A.mt() == C.nt()); + slate_assert(B.mt() == C.mt()); + slate_assert(B.nt() == C.nt()); + + if (trace) slate::trace::Trace::on(); + else slate::trace::Trace::off(); + double time = barrier_get_wtime(MPI_COMM_WORLD); //================================================== @@ -218,6 +201,9 @@ void test_symm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -235,57 +221,36 @@ void test_symm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. 
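After the copy-back just below, the comparison itself now stays entirely in SLATE; a sketch of the pattern these patches introduce, where one is the scalar_t constant 1 used throughout the tests:

    // Sketch: form the difference and its norm on the distributed
    // matrices; no ScaLAPACK work arrays are needed.
    slate::add( -one, Cref, one, C );             // C = C - Cref
    print_matrix( "Diff", C, params );            // optional debug output
    real_t C_diff_norm = slate::norm( norm, C );  // || C - Cref ||
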
copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - size_t ldw = nb*ceildiv( ceildiv( mlocA, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocA + mlocA + ldw); - std::vector worklange(std::max({mlocC, nlocC, mlocB, nlocB})); - - // get norms of the original data - real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An, - &A_data[0], 1, 1, A_desc, &worklansy[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); - //================================================== // Run ScaLAPACK reference routine. //================================================== @@ -297,12 +262,13 @@ void test_symm_work(Params& params, bool run) MPI_Barrier(MPI_COMM_WORLD); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); + + print_matrix( "Diff", C, params ); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm From 08272710ff3857cfc778ee84fdf5792f19acde08 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 15:53:03 -0500 Subject: [PATCH 10/33] Refactor out the handling of some reference warning messages --- test/test_add.cc | 2 +- test/test_copy.cc | 2 +- test/test_gemm.cc | 14 -------------- test/test_gesv.cc | 7 +------ test/test_hemm.cc | 23 +---------------------- test/test_posv.cc | 9 +-------- test/test_scale.cc | 2 +- test/test_scale_row_col.cc | 2 +- test/test_set.cc | 2 +- test/test_symm.cc | 23 +---------------------- test/test_utils.hh | 14 ++++++++++++++ 11 files changed, 23 insertions(+), 77 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index b0fa9581d..b6fa65700 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -199,7 +199,7 @@ void test_add_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( trans ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_copy.cc b/test/test_copy.cc index f6f6b6d7d..f8aa9bd1c 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -187,7 +187,7 @@ void test_copy_work(Params& params, bool run) #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( A_norm ); SLATE_UNUSED( B_norm ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_gemm.cc b/test/test_gemm.cc index ca1356fb4..359b0912b 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -52,7 +52,6 @@ void test_gemm_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); slate::Target target = params.target(); slate::Origin origin = params.origin(); @@ -81,15 +80,6 @@ void test_gemm_work(Params& params, bool run) return; } - #ifndef SLATE_HAVE_SCALAPACK - // Can run ref only when we have ScaLAPACK. 
- if (ref) { - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - ref = false; - } - #endif - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -228,10 +218,6 @@ void test_gemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; diff --git a/test/test_gesv.cc b/test/test_gesv.cc index b1ccec84c..422dc686a 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -68,7 +68,6 @@ void test_gesv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); SLATE_UNUSED(verbose); @@ -383,10 +382,6 @@ void test_gesv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; @@ -431,7 +426,7 @@ void test_gesv_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_hemm.cc b/test/test_hemm.cc index 45d5f50a2..b71708064 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -44,15 +44,12 @@ void test_hemm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); slate::Origin origin = params.origin(); slate::Target target = params.target(); @@ -82,15 +79,6 @@ void test_hemm_work(Params& params, bool run) return; } - #ifndef SLATE_HAVE_SCALAPACK - // Can only run ref when we have ScaLAPACK - if (ref) { - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - ref = false; - } - #endif - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -109,11 +97,6 @@ void test_hemm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - auto A_alloc = allocate_test_HermitianMatrix( false, true, An, params ); auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); @@ -238,10 +221,6 @@ void test_hemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int 
ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; @@ -300,7 +279,7 @@ void test_hemm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_posv.cc b/test/test_posv.cc index 6193a07ef..161413a70 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -40,7 +40,6 @@ void test_posv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; bool hold_local_workspace = params.hold_local_workspace() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); @@ -346,10 +345,6 @@ void test_posv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; @@ -401,9 +396,7 @@ void test_posv_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale.cc b/test/test_scale.cc index 95e01f16a..d6a71c5af 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -168,7 +168,7 @@ void test_scale_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 78fcc64ab..e239beb0d 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -238,7 +238,7 @@ void test_scale_row_col_work( Params& params, bool run ) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_set.cc b/test/test_set.cc index b32af8660..5e1fdf024 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -164,7 +164,7 @@ void test_set_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_symm.cc b/test/test_symm.cc index 6a8dc515d..50bb601ec 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -38,15 +38,12 @@ void test_symm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); @@ -74,15 +71,6 @@ void test_symm_work(Params& params, bool run) 
return; } - #ifndef SLATE_HAVE_SCALAPACK - // Can only run ref when we have ScaLAPACK - if (ref) { - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - ref = false; - } - #endif - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -98,11 +86,6 @@ void test_symm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - auto A_alloc = allocate_test_SymmetricMatrix( false, true, An, params ); auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); @@ -221,10 +204,6 @@ void test_symm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; @@ -283,7 +262,7 @@ void test_symm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_utils.hh b/test/test_utils.hh index acedfedb5..c16d334a7 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -35,6 +35,20 @@ inline bool is_invalid_parameters(Params& params) return true; } + #ifdef SLATE_HAVE_SCALAPACK + if (nonuniform_nb && params.ref()) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + params.ref() = false; + } + #else + // Can only run ref when we have ScaLAPACK + if (params.ref()) { + params.msg() = "skipping reference: ScaLAPACK not available"; + params.ref() = false; + } + #endif + + return false; } From 08d43b590fd4747e0363822157ba0d5f197c8d04 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 16:34:32 -0500 Subject: [PATCH 11/33] Add options to sy/herk and sy/her2k --- test/run_tests.py | 16 ++-- test/test_her2k.cc | 198 +++++++++++++++++---------------------------- test/test_herk.cc | 176 +++++++++++++++------------------------- test/test_syr2k.cc | 190 +++++++++++++++---------------------------- test/test_syrk.cc | 166 ++++++++++++++----------------------- 5 files changed, 271 insertions(+), 475 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index b691e3523..584b96bf7 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -361,19 +361,19 @@ def filter_csv( values, csv ): [ 'hbmm', gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ], - [ 'herk', gen + dtype_real + la + uplo + trans + mn + ab + matrixC ], - [ 'herk', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixC ], + [ 'herk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], + [ 'herk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixC ], - [ 'her2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixBC ], - [ 'her2k', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixBC ], [ 'symm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], - [ 
'syr2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixC ], - [ 'syr2k', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixC ], - [ 'syrk', gen + dtype_real + la + uplo + trans + mn + ab + matrixBC ], - [ 'syrk', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixBC ], # todo: tbsm fails for nb=8 or 16 with --quick. [ 'tbsm', gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ], diff --git a/test/test_her2k.cc b/test/test_her2k.cc index a2af3cef2..3c0ec777e 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -8,10 +8,12 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +40,7 @@ void test_her2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); real_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +52,9 @@ void test_her2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +68,11 @@ void test_her2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -81,72 +88,32 @@ void test_her2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector< scalar_t > A_data(lldA*nlocA); - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - std::vector< scalar_t > B_data(lldB*nlocB); - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - std::vector< scalar_t > C_data(lldC*nlocC); - - slate::Matrix A, B; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
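What replaces the removed block below is the TestMatrix helper pattern; a sketch with the template arguments restored (an assumption, since the diff text elides angle brackets):

    // Sketch: allocate test and (optional) reference matrices in one call;
    // the two bool flags are ref_matrix and nonuniform_ref.
    auto A_alloc = allocate_test_Matrix<scalar_t>( false, true, Am, An, params );
    auto C_alloc = allocate_test_HermitianMatrix<scalar_t>( ref, true, Cn, params );

    auto& A    = A_alloc.A;      // matrix under test
    auto& C    = C_alloc.A;
    auto& Cref = C_alloc.Aref;   // allocated only when ref == true
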
- A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); + + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector< scalar_t > Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -172,17 +139,19 @@ void test_her2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^H X)) + (conj(alpha) B (A^H X)) + (beta C X). // Y = beta C X @@ -221,6 +190,9 @@ void test_her2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -239,54 +211,30 @@ void test_her2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or CPU tiles. + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + copy(C, &C_data[0], C_desc); + } //================================================== // Run ScaLAPACK reference routine. 
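The error scaling computed in the reference hunk that follows is the rank-2k analogue of the other checks; spelled out in LaTeX (the trailing beta term is assumed from the matching tests, since the hunk ends mid-expression):

    \mathrm{error} \;=\;
        \frac{\| C - C_{\mathrm{ref}} \|}
             {\sqrt{2k + 2}\; |\alpha|\, \|A\|\, \|B\|
              \;+\; 2\, |\beta|\, \|C_{\mathrm{orig}}\|}
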
@@ -300,13 +248,11 @@ void test_her2k_work(Params& params, bool run) print_matrix("Cref", Cref, params); - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm @@ -323,7 +269,7 @@ void test_her2k_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_herk.cc b/test/test_herk.cc index 8e5660241..9435bb550 100644 --- a/test/test_herk.cc +++ b/test/test_herk.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +37,7 @@ void test_herk_work(Params& params, bool run) int64_t k = params.dim.k(); real_t alpha = params.alpha.get(); real_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -49,6 +48,9 @@ void test_herk_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -62,6 +64,11 @@ void test_herk_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -75,64 +82,28 @@ void test_herk_work(Params& params, bool run) // setup so op(A) is n-by-k int64_t Am = (transA == slate::Op::NoTrans ? n : k); int64_t An = (transA == slate::Op::NoTrans ? k : n); - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } - - slate::Matrix A; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
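A note on the "shallow copy for transposing" idiom that the herk hunk below keeps: transposition in SLATE is a view, not a data movement. A minimal sketch; the helper name is hypothetical:

    #include "slate/slate.hh"

    // Hypothetical sketch: keep the original A and operate on an alias.
    template <typename scalar_t>
    slate::Matrix<scalar_t> apply_trans(
        slate::Op trans, slate::Matrix<scalar_t>& A )
    {
        slate::Matrix<scalar_t> opA = A;  // shallow copy; shares tiles
        if (trans == slate::Op::Trans)
            opA = slate::transpose( A );
        else if (trans == slate::Op::ConjTrans)
            opA = slate::conj_transpose( A );
        return opA;  // no tile data was moved
    }
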
- A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, C ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -147,17 +118,19 @@ void test_herk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^H X) + (beta C X). // Y = beta C X @@ -189,6 +162,9 @@ void test_herk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -207,48 +183,26 @@ void test_herk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or CPU tiles. + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); + } //================================================== // Run ScaLAPACK reference routine. 
@@ -259,13 +213,11 @@ void test_herk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -281,7 +233,7 @@ void test_herk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index 2ae006ce1..e3fa8cc22 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -8,10 +8,12 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +40,7 @@ void test_syr2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +52,9 @@ void test_syr2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +68,11 @@ void test_syr2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,77 +86,32 @@ void test_syr2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
- std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::Matrix A, B; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector< scalar_t > Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -175,17 +137,19 @@ void test_syr2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^T X)) + alpha B (A^T X)) + (beta C X). // Y = beta C X @@ -224,6 +188,9 @@ void test_syr2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -242,54 +209,31 @@ void test_syr2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -302,11 +246,11 @@ void test_syr2k_work(Params& params, bool run) print_matrix("Cref", Cref, params); - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm @@ -323,7 +267,7 @@ void test_syr2k_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_syrk.cc b/test/test_syrk.cc index 03d6de765..c8683b837 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +37,7 @@ void test_syrk_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -49,6 +48,9 @@ void test_syrk_work(Params& params, bool run) params.matrix.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -62,6 +64,11 @@ void test_syrk_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,64 +80,28 @@ void test_syrk_work(Params& params, bool run) // setup so op(A) is n-by-k int64_t Am = (transA == slate::Op::NoTrans ? n : k); int64_t An = (transA == slate::Op::NoTrans ? k : n); - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } - - slate::Matrix A; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
- A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -145,17 +116,19 @@ void test_syrk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^T X) + (beta C X). // Y = beta C X @@ -187,6 +160,9 @@ void test_syrk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -205,49 +181,27 @@ void test_syrk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -257,11 +211,11 @@ void test_syrk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime( MPI_COMM_WORLD ) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -277,7 +231,7 @@ void test_syrk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } From 799cd98127fded46d2f0f8a0708386d9b28c6e12 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 16:41:34 -0500 Subject: [PATCH 12/33] Consolidate some if statements --- test/test_gemm.cc | 15 ++++++--------- test/test_hemm.cc | 6 ++---- test/test_her2k.cc | 6 ++---- test/test_herk.cc | 6 ++---- test/test_symm.cc | 6 ++---- test/test_syr2k.cc | 6 ++---- test/test_syrk.cc | 6 ++---- 7 files changed, 18 insertions(+), 33 deletions(-) diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 359b0912b..c20db8ea0 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -110,9 +110,14 @@ void test_gemm_work(Params& params, bool run) slate::generate_matrix(params.matrixB, B); slate::generate_matrix(params.matrixC, C); - // if reference run is required, copy test data. + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); } if (transA == slate::Op::Trans) @@ -129,14 +134,6 @@ void test_gemm_work(Params& params, bool run) slate_assert(B.nt() == C.nt()); slate_assert(A.nt() == B.mt()); - // If reference run is required, record norms to be used in the check/ref. - real_t A_norm=0, B_norm=0, C_orig_norm=0; - if (ref) { - A_norm = slate::norm(norm, A); - B_norm = slate::norm(norm, B); - C_orig_norm = slate::norm(norm, Cref); - } - // If check run, perform first half of SLATE residual check. TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { diff --git a/test/test_hemm.cc b/test/test_hemm.cc index b71708064..a31612b53 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -110,13 +110,11 @@ void test_hemm_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. 
real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_her2k.cc b/test/test_her2k.cc index 3c0ec777e..fc8d4077b 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -103,13 +103,11 @@ void test_her2k_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_herk.cc b/test/test_herk.cc index 9435bb550..d3f9144ba 100644 --- a/test/test_herk.cc +++ b/test/test_herk.cc @@ -94,13 +94,11 @@ void test_herk_work(Params& params, bool run) slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); C_orig_norm = slate::norm(norm, Cref); } diff --git a/test/test_symm.cc b/test/test_symm.cc index 50bb601ec..e463b5d34 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -99,13 +99,11 @@ void test_symm_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index e3fa8cc22..a42aaf6c9 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -101,13 +101,11 @@ void test_syr2k_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_syrk.cc b/test/test_syrk.cc index c8683b837..bc8e6e1d0 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -92,13 +92,11 @@ void test_syrk_work(Params& params, bool run) slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixC, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. 
real_t A_norm=0, C_orig_norm=0;
     if (ref) {
+        slate::copy( C, Cref );
+
         A_norm = slate::norm(norm, A);
         C_orig_norm = slate::norm(norm, Cref);
     }

From 8b00be41e64f5b50b9600414fe0f2cf9f29e820c Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Tue, 19 Dec 2023 12:27:17 -0500
Subject: [PATCH 13/33] Add options to trmm and trsm

---
 test/matrix_utils.hh | 115 ++++++++++++++++++++++++++++++
 test/run_tests.py    |   8 +--
 test/test_trmm.cc    | 162 ++++++++++++++++---------------------
 test/test_trsm.cc    | 115 +++++++++--------------------
 4 files changed, 211 insertions(+), 189 deletions(-)

diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 59f9f3aa4..27fda8bac 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -539,4 +539,119 @@ TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
             ref_matrix, nonuniform_ref, n, params );
 }
 
+// -----------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TriangularMatrix
+inline void mark_params_for_test_TriangularMatrix(Params& params)
+{
+    params.grid.m();
+    params.grid.n();
+    params.dev_dist();
+    params.uplo();
+    params.diag();
+    params.nb();
+    params.nonuniform_nb();
+    params.origin();
+    params.grid_order();
+}
+
+// -----------------------------------------------------------------------------
+/// Allocates a TriangularMatrix and a reference version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    slate::Dist dev_dist = params.dev_dist();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+
+    // The object to be returned
+    TestMatrix<slate::TriangularMatrix<scalar_t>> matrix( n, n, nb, p, q, grid_order );
+
+    // Functions for nonuniform tile sizes or row device distribution
+    std::function< int64_t (int64_t j) > tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        tileNb = slate::func::uniform_blocksize( n, nb );
+    }
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
+                                                   p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        if (nonuniform_nb || dev_dist == slate::Dist::Col) {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+        }
+        else {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD);
+        }
+
+        // SLATE allocates CPU or GPU tiles.
+ slate::Target origin_target = origin2target(origin); + matrix.A.insertLocalTiles(origin_target); + } + else { + assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb && nonuniform_ref) { + matrix.Aref = slate::TriangularMatrix( + uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + } + + return matrix; +} + + #endif // SLATE_MATRIX_UTILS_HH diff --git a/test/run_tests.py b/test/run_tests.py index 584b96bf7..1de26268b 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -378,11 +378,11 @@ def filter_csv( values, csv ): # todo: tbsm fails for nb=8 or 16 with --quick. [ 'tbsm', gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ], - [ 'trmm', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], + [ 'trmm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], - [ 'trsm', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], - [ 'trsmA', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], - [ 'trsmB', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], + [ 'trsm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trsmA', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trsmB', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], ] # LU diff --git a/test/test_trmm.cc b/test/test_trmm.cc index 26e0b3b64..941ee2ecb 100644 --- a/test/test_trmm.cc +++ b/test/test_trmm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,9 +40,6 @@ void test_trmm_work(Params& params, bool run) int64_t nrhs = params.nrhs(); scalar_t alpha = params.alpha.get(); slate::Op transB = params.transB(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -51,6 +50,9 @@ void test_trmm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -65,6 +67,11 @@ void test_trmm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,60 +86,24 @@ void test_trmm_work(Params& params, bool run) int64_t Bm = (transB == slate::Op::NoTrans ? 
m : n); int64_t Bn = (transB == slate::Op::NoTrans ? n : m); - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - } - - slate::TriangularMatrix A; - slate::Matrix B; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::TriangularMatrix(uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_TriangularMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( ref, true, Bm, Bn, params ); - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; generate_matrix( params.matrix, A ); generate_matrix( params.matrixB, B ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - std::vector Bref_data; - if (ref) { - Bref_data.resize( lldB * nlocB ); - auto Bref = slate::Matrix::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - slate::copy( B, Bref ); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_orig_norm=0; + if (ref) { + slate::copy( B, Bref ); + + A_norm = slate::norm(norm, A); + B_orig_norm = slate::norm(norm, B); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -148,14 +119,16 @@ void test_trmm_work(Params& params, bool run) B = conj_transpose( B ); // If check run, perform first half of SLATE residual check. - slate::Matrix X, X2, Y; + TestMatrix> X_alloc, X2_alloc, Y_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - X2 = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X2.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + X2_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + + auto& X = X_alloc.A; + auto& X2 = X2_alloc.A; + auto& Y = Y_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); generate_matrix( mp, X ); @@ -207,6 +180,9 @@ void test_trmm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, B*X - Y. 
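        // Y was formed from the original A and B before the run, so a correct
        // updated B must satisfy B X = Y up to roundoff.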
real_t y_norm = slate::norm( norm, Y, opts ); @@ -224,48 +200,27 @@ void test_trmm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - if (origin != slate::Origin::ScaLAPACK) { - // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - } + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); - // allocate workspace for norms - std::vector worklantr(std::max(mlocA, nlocA)); - std::vector worklange(std::max(mlocB, nlocB)); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; - // get norms of the original data - real_t A_norm = scalapack_plantr( - norm2str(norm), uplo2str(uplo), diag2str(diag), Am, An, &A_data[0], - 1, 1, A_desc, &worklantr[0]); - real_t B_orig_norm = scalapack_plange( - norm2str(norm), Bm, Bn, &Bref_data[0], 1, 1, B_desc, &worklange[0]); + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + } //================================================== // Run ScaLAPACK reference routine. 
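The resize-then-copy block above now recurs in every tester that compares against ScaLAPACK. A small helper could fold it away; here is a sketch under the assumption that TestMatrix exposes A, A_data, lld, and nloc as used throughout these patches (the helper itself is hypothetical, not part of the patch series):

    // Hypothetical helper: gather the SLATE matrix into the TestMatrix's
    // ScaLAPACK-layout buffer before a reference run.
    template <typename MatrixType>
    void copy_to_scalapack_layout( TestMatrix<MatrixType>& alloc, blas_int* desc )
    {
        alloc.A_data.resize( alloc.lld * alloc.nloc );
        copy( alloc.A, &alloc.A_data[0], desc );
    }

Call sites would then reduce to copy_to_scalapack_layout( A_alloc, A_desc ) and the like, keeping the origin != slate::Origin::ScaLAPACK guard at the caller, since ScaLAPACK-origin runs already share the buffer.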
@@ -277,12 +232,11 @@ void test_trmm_work(Params& params, bool run) &Bref_data[0], 1, 1, Bref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Bref_data - B_data - blas::axpy(Bref_data.size(), -1.0, &B_data[0], 1, &Bref_data[0], 1); + // get differences B = B - Bref + slate::add(-one, Bref, one, B); - // norm(Bref_data - B_data) - real_t B_diff_norm = scalapack_plange(norm2str(norm), Bm, Bn, &Bref_data[0], - 1, 1, Bref_desc, &worklange[0]); + // norm(B - Bref) + real_t B_diff_norm = slate::norm(norm, B); real_t error = B_diff_norm / (sqrt(real_t(Am) + 2) * std::abs(alpha) * A_norm * B_orig_norm); @@ -298,7 +252,7 @@ void test_trmm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_trsm.cc b/test/test_trsm.cc index 24e3d3360..5f4bd7045 100644 --- a/test/test_trsm.cc +++ b/test/test_trsm.cc @@ -6,12 +6,14 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "print_matrix.hh" -#include "grid_utils.hh" #include #include @@ -44,9 +46,6 @@ void test_trsm_work(Params& params, bool run) int64_t m = params.dim.m(); int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -58,6 +57,9 @@ void test_trsm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +74,11 @@ void test_trsm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -87,49 +94,12 @@ void test_trsm_work(Params& params, bool run) int64_t Bm = m; int64_t Bn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - } + auto A_alloc = allocate_test_TriangularMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( check || ref, true, Bm, Bn, params ); - slate::TriangularMatrix A; - slate::Matrix B; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
- slate::Target origin_target = origin2target(origin); - A = slate::TriangularMatrix( - uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix( - Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - - B_data.resize( lldB * nlocB ); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); @@ -140,13 +110,8 @@ void test_trsm_work(Params& params, bool run) auto AH = slate::HermitianMatrix( A ); slate::potrf( AH, opts ); - // if check is required, copy test data - std::vector< scalar_t > Bref_data; - slate::Matrix Bref; + // If reference run is required, record norms to be used in the check/ref. if (check || ref) { - Bref_data.resize( lldB * nlocB ); - Bref = slate::Matrix::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); slate::copy( B, Bref ); } @@ -220,34 +185,22 @@ void test_trsm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK for timing only - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); + + auto& A_data = A_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + } //================================================== // Run ScaLAPACK reference routine. 
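Taken together, these changes converge the level-3 BLAS testers on a single allocation pattern. A condensed sketch of that skeleton, assembled from the calls visible in the diffs above; test_example_work is hypothetical, and the ref/dim reads are simplified assumptions about the Params interface rather than verbatim tester code:

    // Hypothetical tester skeleton; a sketch, not a verbatim file.
    template <typename scalar_t>
    void test_example_work( Params& params, bool run )
    {
        // Declare which grid/tile parameters this test consumes.
        mark_params_for_test_TriangularMatrix( params );
        mark_params_for_test_Matrix( params );

        bool ref = params.ref() == 'y';   // simplified read, assumed interface
        int64_t An = params.dim.n();
        int64_t Bm = params.dim.m();
        int64_t Bn = params.dim.n();

        if (! run)
            return;
        // Reject combinations such as nonuniform_nb with a ScaLAPACK origin.
        if (is_invalid_parameters( params ))
            return;

        // B gets a reference copy only when a reference run is requested.
        auto A_alloc = allocate_test_TriangularMatrix<scalar_t>( false, true, An, params );
        auto B_alloc = allocate_test_Matrix<scalar_t>( ref, true, Bm, Bn, params );
        auto& A    = A_alloc.A;
        auto& B    = B_alloc.A;
        auto& Bref = B_alloc.Aref;

        slate::generate_matrix( params.matrix, A );
        slate::generate_matrix( params.matrixB, B );
        if (ref)
            slate::copy( B, Bref );

        // ... run the routine under test, then compare B against Bref ...
    }

When both nonuniform_nb and nonuniform_ref are set, Aref shares A's non-uniform tile layout; otherwise Aref is backed by the ScaLAPACK-layout buffer Aref_data, which is what the reference comparisons above rely on.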
From 782c4821750916d979844ffbe24b833973e0f8b6 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 13:19:07 -0500 Subject: [PATCH 14/33] Move allocate_test_* functions to cc file --- GNUmakefile | 1 + test/matrix_utils.cc | 475 +++++++++++++++++++++++++++++++++++++++++++ test/matrix_utils.hh | 365 ++------------------------------- 3 files changed, 494 insertions(+), 347 deletions(-) create mode 100644 test/matrix_utils.cc diff --git a/GNUmakefile b/GNUmakefile index 604394716..b71c10c0c 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -702,6 +702,7 @@ endif tester_src += \ test/matrix_generator.cc \ test/matrix_params.cc \ + test/matrix_utils.cc \ test/random.cc \ test/test.cc \ test/test_add.cc \ diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc new file mode 100644 index 000000000..b157e97ee --- /dev/null +++ b/test/matrix_utils.cc @@ -0,0 +1,475 @@ +// Copyright (c) 2017-2023, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "matrix_utils.hh" + +//------------------------------------------------------------------------------ +/// Allocates a Matrix and optionally a reference version for testing. +/// +/// @param[in] ref_matrix +/// Whether to allocate a reference matrix +/// +/// @param[in] nonuniform_ref +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param[in] m +/// The number of rows +/// +/// @param[in] n +/// The number of columns +/// +/// @param[in] params +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params) +{ + // Load params variables + int p = params.grid.m(); + int q = params.grid.n(); + slate::Dist dev_dist = params.dev_dist(); + int64_t nb = params.nb(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + slate::Origin origin = params.origin(); + slate::GridOrder grid_order = params.grid_order(); + + // The object to be returned + TestMatrix> matrix( m, n, nb, p, q, grid_order ); + + // Functions for nonuniform tile sizes or row device distribution + std::function< int64_t (int64_t j) > tileMb, tileNb; + if (nonuniform_nb) { + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? nb*2 : nb); + }; + tileMb = tileNb; + } + else { + tileMb = slate::func::uniform_blocksize( m, nb ); + tileNb = slate::func::uniform_blocksize( n, nb ); + } + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); + int num_devices_ = blas::get_device_count(); + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), + p, num_devices_ ); + + // Setup matrix to test SLATE with + if (origin != slate::Origin::ScaLAPACK) { + // SLATE allocates CPU or GPU tiles. 
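+        // origin2target maps Origin::Host / Origin::Devices to the matching
+        // Target, so the tiles inserted below land in CPU memory or on the GPUs.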
+ slate::Target origin_target = origin2target( origin ); + if (nonuniform_nb || dev_dist == slate::Dist::Col) { + matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD); + } + else { + matrix.A = slate::Matrix( m, n, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + matrix.A.insertLocalTiles( origin_target ); + } + else { + assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::Matrix::fromScaLAPACK( + m, n, &matrix.A_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb && nonuniform_ref) { + matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD ); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::Matrix::fromScaLAPACK( + m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + } + + return matrix; +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_Matrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_Matrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +//------------------------------------------------------------------------------ +/// Helper routine to avoid duplicating logic between HermitianMatrix +/// and SymmetricMatrix +/// +template +TestMatrix allocate_test_HeSyMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params) +{ + // Load params variables + slate::Uplo uplo = params.uplo(); + int p = params.grid.m(); + int q = params.grid.n(); + slate::Dist dev_dist = params.dev_dist(); + int64_t nb = params.nb(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + slate::Origin origin = params.origin(); + slate::GridOrder grid_order = params.grid_order(); + + // The object to be returned + TestMatrix matrix ( n, n, nb, p, q, grid_order ); + + // Functions for nonuniform tile sizes or row device distribution + std::function< int64_t (int64_t j) > tileNb; + if (nonuniform_nb) { + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? nb*2 : nb); + }; + } + else { + tileNb = slate::func::uniform_blocksize( n, nb ); + } + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); + int num_devices_ = blas::get_device_count(); + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), + p, num_devices_ ); + + // Setup matrix to test SLATE with + if (origin != slate::Origin::ScaLAPACK) { + if (nonuniform_nb || dev_dist == slate::Dist::Col) { + matrix.A = matrixtype( + uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); + } + else { + matrix.A = matrixtype( + uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD); + } + + // SLATE allocates CPU or GPU tiles. 
+        slate::Target origin_target = origin2target(origin);
+        matrix.A.insertLocalTiles(origin_target);
+    }
+    else {
+        assert( !nonuniform_nb );
+        assert( dev_dist == slate::Dist::Row );
+        // Create SLATE matrix from the ScaLAPACK layouts
+        matrix.A_data.resize( matrix.lld * matrix.nloc );
+        matrix.A = matrixtype::fromScaLAPACK(
+                uplo, n, &matrix.A_data[0], matrix.lld, nb,
+                grid_order, p, q, MPI_COMM_WORLD);
+    }
+
+    // Setup reference matrix
+    if (ref_matrix) {
+        if (nonuniform_nb && nonuniform_ref) {
+            matrix.Aref = matrixtype(
+                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+            matrix.Aref.insertLocalTiles( slate::Target::Host );
+        }
+        else {
+            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
+            matrix.Aref = matrixtype::fromScaLAPACK(
+                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD);
+        }
+    }
+
+    return matrix;
+}
+
+//------------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::HermitianMatrix<scalar_t>>(
+            ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::HermitianMatrix<float>> allocate_test_HermitianMatrix<float>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<double>> allocate_test_HermitianMatrix<double>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<float>>> allocate_test_HermitianMatrix<std::complex<float>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<double>>> allocate_test_HermitianMatrix<std::complex<double>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a SymmetricMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::SymmetricMatrix<scalar_t>>(
+            ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::SymmetricMatrix<float>> allocate_test_SymmetricMatrix<float>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<double>> allocate_test_SymmetricMatrix<double>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<float>>> allocate_test_SymmetricMatrix<std::complex<float>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<double>>> allocate_test_SymmetricMatrix<std::complex<double>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a TriangularMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    slate::Dist dev_dist = params.dev_dist();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+
+    // The object to be returned
+    TestMatrix<slate::TriangularMatrix<scalar_t>> matrix( n, n, nb, p, q, grid_order );
+
+    // Functions for nonuniform tile sizes or row device distribution
+    std::function< int64_t (int64_t j) > tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        tileNb = slate::func::uniform_blocksize( n, nb );
+    }
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
+                                                   p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        if (nonuniform_nb || dev_dist == slate::Dist::Col) {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+        }
+        else {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD);
+        }
+
+        // SLATE allocates CPU or GPU tiles.
+ slate::Target origin_target = origin2target(origin); + matrix.A.insertLocalTiles(origin_target); + } + else { + assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb && nonuniform_ref) { + matrix.Aref = slate::TriangularMatrix( + uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + } + + return matrix; +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +TestMatrix> allocate_test_TriangularMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); + +template +TestMatrix> allocate_test_TriangularMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_TriangularMatrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_TriangularMatrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 27fda8bac..b3db03c62 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -7,6 +7,7 @@ #define SLATE_MATRIX_UTILS_HH #include "slate/slate.hh" +#include "test.hh" #include "scalapack_wrappers.hh" #include "grid_utils.hh" @@ -272,386 +273,56 @@ inline void mark_params_for_test_Matrix(Params& params) } //------------------------------------------------------------------------------ -/// Allocates a Matrix and optionally a reference version for testing. -/// -/// @param[in] ref_matrix -/// Whether to allocate a reference matrix -/// -/// @param[in] nonuniform_ref -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param[in] m -/// The number of rows -/// -/// @param[in] n -/// The number of columns -/// -/// @param[in] params -/// The test params object which contains many of the key parameters -/// -template -TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params) -{ - // Load params variables - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix> matrix( m, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes. - // Odd-numbered tiles are 2*nb, even-numbered tiles are nb. - std::function< int64_t (int64_t j) > tileMb, tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? 
nb*2 : nb); - }; - tileMb = tileNb; - } - else { - tileMb = slate::func::uniform_blocksize( m, nb ); - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - if (nonuniform_nb) { - params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) ) - + ", " + std::to_string( tileNb( 1 ) ); - } - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = slate::Matrix( m, n, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - matrix.A.insertLocalTiles( origin_target ); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::Matrix::fromScaLAPACK( - m, n, &matrix.A_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD ); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::Matrix::fromScaLAPACK( - m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - } - - return matrix; -} - -// ----------------------------------------------------------------------------- /// Marks the paramters used by allocate_test_HermitianMatrix inline void mark_params_for_test_HermitianMatrix(Params& params) { - params.grid.m(); - params.grid.n(); - params.dev_dist(); params.uplo(); - params.nb(); - params.nonuniform_nb(); - params.origin(); - params.grid_order(); + mark_params_for_test_Matrix( params ); } -// ----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Marks the paramters used by allocate_test_SymmetricMatrix inline void mark_params_for_test_SymmetricMatrix(Params& params) { mark_params_for_test_HermitianMatrix( params ); } +//------------------------------------------------------------------------------ +/// Marks the paramters used by allocate_test_HermitianMatrix +inline void mark_params_for_test_TriangularMatrix(Params& params) +{ + params.uplo(); + params.diag(); + mark_params_for_test_Matrix( params ); +} -// ----------------------------------------------------------------------------- -/// Helper routine to avoid duplicating logic between HermitianMatrix and SymmetricMatrix -/// -template -TestMatrix allocate_test_HeSyMatrix( +template +TestMatrix> allocate_test_Matrix( bool ref_matrix, bool nonuniform_ref, + int64_t m, int64_t n, - Params& params) -{ - // Load params variables - slate::Uplo uplo = params.uplo(); - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix 
matrix ( n, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = matrixtype( - uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } + Params& params); - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = matrixtype::fromScaLAPACK( - uplo, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = matrixtype::fromScaLAPACK( - uplo, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } - - return matrix; -} - -// ----------------------------------------------------------------------------- -/// Allocates a HermitianMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param m[in] -/// The number of rows -/// -/// @param n[in] -/// The number of columns -/// -/// @param params[in] -/// The test params object which contains many of the key parameters -/// template TestMatrix> allocate_test_HermitianMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, - Params& params) -{ - return allocate_test_HeSyMatrix>( - ref_matrix, nonuniform_ref, n, params ); -} + Params& params); -// ----------------------------------------------------------------------------- -/// Allocates a SymmetricMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. 
-/// -/// @param m[in] -/// The number of rows -/// -/// @param n[in] -/// The number of columns -/// -/// @param params[in] -/// The test params object which contains many of the key parameters -/// template TestMatrix> allocate_test_SymmetricMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, - Params& params) -{ - return allocate_test_HeSyMatrix>( - ref_matrix, nonuniform_ref, n, params ); -} + Params& params); -// ----------------------------------------------------------------------------- -/// Marks the paramters used by allocate_test_HermitianMatrix -inline void mark_params_for_test_TriangularMatrix(Params& params) -{ - params.grid.m(); - params.grid.n(); - params.dev_dist(); - params.uplo(); - params.diag(); - params.nb(); - params.nonuniform_nb(); - params.origin(); - params.grid_order(); -} - -// ----------------------------------------------------------------------------- -/// Allocates a SymmetricMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param m[in] -/// The number of rows -/// -/// @param n[in] -/// The number of columns -/// -/// @param params[in] -/// The test params object which contains many of the key parameters -/// template TestMatrix> allocate_test_TriangularMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, - Params& params) -{ - // Load params variables - slate::Uplo uplo = params.uplo(); - slate::Diag diag = params.diag(); - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix> matrix ( n, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. 
- slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } - - return matrix; -} - + Params& params); #endif // SLATE_MATRIX_UTILS_HH From cca1384c07cf8d4f4bbcce63d7f51ac39b9a4b46 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 14:08:04 -0500 Subject: [PATCH 15/33] Deduplicate code in matrix allocate --- test/matrix_utils.cc | 290 +++++++++++++++++-------------------------- 1 file changed, 116 insertions(+), 174 deletions(-) diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc index b157e97ee..2e126149c 100644 --- a/test/matrix_utils.cc +++ b/test/matrix_utils.cc @@ -5,32 +5,22 @@ #include "matrix_utils.hh" +using nb_func_t = std::function< int64_t(int64_t) >; +using dist_func_t = std::function< int(std::tuple) >; + //------------------------------------------------------------------------------ -/// Allocates a Matrix and optionally a reference version for testing. -/// -/// @param[in] ref_matrix -/// Whether to allocate a reference matrix -/// -/// @param[in] nonuniform_ref -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param[in] m -/// The number of rows -/// -/// @param[in] n -/// The number of columns -/// -/// @param[in] params -/// The test params object which contains many of the key parameters -/// -template -TestMatrix> allocate_test_Matrix( +/// Shared logic of the allocate_test_* routines +template +static TestMatrix allocate_test_shared( bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, - Params& params) + Params& params, + irregular_constructor_t construct_irregular, + regular_constructor_t construct_regular, + scalapack_constructor_t construct_scalapack) { // Load params variables int p = params.grid.m(); @@ -42,10 +32,10 @@ TestMatrix> allocate_test_Matrix( slate::GridOrder grid_order = params.grid_order(); // The object to be returned - TestMatrix> matrix( m, n, nb, p, q, grid_order ); + TestMatrix matrix( m, n, nb, p, q, grid_order ); // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileMb, tileNb; + nb_func_t tileMb, tileNb; if (nonuniform_nb) { tileNb = [nb](int64_t j) { // for non-uniform tile size @@ -54,8 +44,8 @@ TestMatrix> allocate_test_Matrix( tileMb = tileNb; } else { - tileMb = slate::func::uniform_blocksize( m, nb ); tileNb = slate::func::uniform_blocksize( n, nb ); + tileMb = (m == n) ? tileNb : slate::func::uniform_blocksize( m, nb ); } auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); int num_devices_ = blas::get_device_count(); @@ -67,12 +57,10 @@ TestMatrix> allocate_test_Matrix( // SLATE allocates CPU or GPU tiles. 
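        // Dispatch on the caller-supplied constructors: construct_irregular
        // whenever the layout cannot be described by a uniform 2D block
        // cycle (nonuniform nb, or a column device distribution), else
        // construct_regular; the ScaLAPACK branch below instead wraps
        // caller-owned memory via construct_scalapack.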
slate::Target origin_target = origin2target( origin ); if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); + matrix.A = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); } else { - matrix.A = slate::Matrix( m, n, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); + matrix.A = construct_regular( nb, grid_order, p, q ); } matrix.A.insertLocalTiles( origin_target ); } @@ -81,30 +69,75 @@ TestMatrix> allocate_test_Matrix( assert( dev_dist == slate::Dist::Row ); // Create SLATE matrix from the ScaLAPACK layouts matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::Matrix::fromScaLAPACK( - m, n, &matrix.A_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); + matrix.A = construct_scalapack( &matrix.A_data[0], matrix.lld, nb, + grid_order, p, q ); } // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD ); + matrix.Aref = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); matrix.Aref.insertLocalTiles( slate::Target::Host ); } else { matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::Matrix::fromScaLAPACK( - m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); + matrix.Aref = construct_scalapack( &matrix.Aref_data[0], matrix.lld, nb, + grid_order, p, q ); } } return matrix; } -//------------------------------------------------------------------------------ +// ----------------------------------------------------------------------------- +/// Allocates a Matrix and optionally a reference version for testing. +/// +/// @param[in] ref_matrix +/// Whether to allocate a reference matrix +/// +/// @param[in] nonuniform_ref +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param[in] m +/// The number of rows +/// +/// @param[in] n +/// The number of columns +/// +/// @param[in] params +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params) +{ + auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return slate::Matrix( m, n, tileMb, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return slate::Matrix( m, n, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return slate::Matrix::fromScaLAPACK( + m, n, data, lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared>( + ref_matrix, nonuniform_ref, m, n, params, + construct_irregular, construct_regular, construct_scalapack ); +} // Explicit instantiations. 
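The three-lambda shape used here recurs for every matrix kind below: the shared routine owns the branching (origin, nonuniform nb, device distribution), while each wrapper only says how to build its particular type. A minimal, self-contained sketch of that idea, using a made-up FakeMatrix stand-in rather than SLATE types:

    // callback_alloc_demo.cc -- the deduplication pattern in miniature.
    // FakeMatrix and the labels are hypothetical, not SLATE code.
    #include <functional>
    #include <iostream>
    #include <string>

    struct FakeMatrix { std::string how; };

    template <typename matrix_t>
    matrix_t allocate_shared(
        bool nonuniform_nb,
        std::function<matrix_t()> construct_irregular,
        std::function<matrix_t()> construct_regular)
    {
        // The shared routine decides *which* constructor runs;
        // each caller decides *how* its matrix type is built.
        return nonuniform_nb ? construct_irregular() : construct_regular();
    }

    int main()
    {
        auto m = allocate_shared<FakeMatrix>(
            true,
            [] { return FakeMatrix{ "irregular (tile-size lambdas)" }; },
            [] { return FakeMatrix{ "regular (uniform nb)" }; } );
        std::cout << m.how << "\n";  // prints: irregular (tile-size lambdas)
        return 0;
    }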
template TestMatrix> allocate_test_Matrix( @@ -142,83 +175,37 @@ TestMatrix>> allocate_test_Matrix -TestMatrix allocate_test_HeSyMatrix( +template +TestMatrix allocate_test_HeSyMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { - // Load params variables - slate::Uplo uplo = params.uplo(); - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix matrix ( n, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = matrixtype( - uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = matrixtype::fromScaLAPACK( - uplo, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } + using scalar_t = typename matrix_type::value_type; - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = matrixtype::fromScaLAPACK( - uplo, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } + slate::Uplo uplo = params.uplo(); - return matrix; + auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return matrix_type( uplo, n, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return matrix_type( uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return matrix_type::fromScaLAPACK( uplo, n, data, lld, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared( + ref_matrix, nonuniform_ref, n, n, params, + construct_irregular, construct_regular, construct_scalapack ); } //------------------------------------------------------------------------------ @@ -374,74 +361,29 @@ TestMatrix> allocate_test_TriangularMatrix( // Load params variables slate::Uplo uplo = params.uplo(); slate::Diag diag = params.diag(); - int p = params.grid.m(); - int q = params.grid.n(); - 
slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix> matrix ( n, n, nb, p, q, grid_order ); - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } - - return matrix; + auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return slate::TriangularMatrix( uplo, diag, n, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return slate::TriangularMatrix( uplo, diag, n, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, data, lld, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared>( + ref_matrix, nonuniform_ref, n, n, params, + construct_irregular, construct_regular, construct_scalapack ); } //------------------------------------------------------------------------------ From 3783b19e316e7991644ee8f8b1cdf01dac264973 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 14:13:01 -0500 Subject: [PATCH 16/33] Tighten some function signatures --- test/matrix_utils.cc | 122 +++++++++---------------------------------- test/matrix_utils.hh | 21 ++------ 2 files changed, 29 insertions(+), 114 deletions(-) diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc index 2e126149c..df9c1022a 100644 --- a/test/matrix_utils.cc +++ b/test/matrix_utils.cc @@ -13,14 +13,10 @@ using 
dist_func_t = std::function< int(std::tuple) >; template static TestMatrix allocate_test_shared( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params, - irregular_constructor_t construct_irregular, - regular_constructor_t construct_regular, - scalapack_constructor_t construct_scalapack) + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params, + irregular_constructor_t construct_irregular, + regular_constructor_t construct_regular, + scalapack_constructor_t construct_scalapack) { // Load params variables int p = params.grid.m(); @@ -110,11 +106,7 @@ static TestMatrix allocate_test_shared( /// template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params) { auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) @@ -141,35 +133,19 @@ TestMatrix> allocate_test_Matrix( // Explicit instantiations. template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix>> allocate_test_Matrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix>> allocate_test_Matrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); //------------------------------------------------------------------------------ /// Helper routine to avoid duplicating logic between HermitianMatrix @@ -177,10 +153,7 @@ TestMatrix>> allocate_test_Matrix TestMatrix allocate_test_HeSyMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { using scalar_t = typename matrix_type::value_type; @@ -230,10 +203,7 @@ TestMatrix allocate_test_HeSyMatrix( /// template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { return allocate_test_HeSyMatrix>( ref_matrix, nonuniform_ref, n, params ); @@ -243,31 +213,19 @@ TestMatrix> allocate_test_HermitianMatrix( // Explicit instantiations. 
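A note on the "Explicit instantiations." blocks these hunks keep reshaping: since the allocator templates are defined in matrix_utils.cc rather than the header, every scalar type a tester may request must be instantiated there by hand, or the tester fails at link time. A small sketch of the mechanism with a hypothetical scale_by_nb function, split across two translation units the same way as matrix_utils.hh/.cc:

    // scale_demo.hh -- declaration only; the definition lives in the .cc.
    #include <cstdint>
    template <typename scalar_t>
    scalar_t scale_by_nb( scalar_t x, int64_t nb );

    // scale_demo.cc -- definition plus one explicit instantiation per
    // scalar type callers are allowed to use; any other type fails at
    // link time rather than compile time.
    #include "scale_demo.hh"
    template <typename scalar_t>
    scalar_t scale_by_nb( scalar_t x, int64_t nb )
    {
        return x * scalar_t( nb );
    }

    template float  scale_by_nb<float> ( float,  int64_t );
    template double scale_by_nb<double>( double, int64_t );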
template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_HermitianMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_HermitianMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); //------------------------------------------------------------------------------ /// Allocates a SymmetricMatrix and optionally a reference @@ -291,10 +249,7 @@ TestMatrix>> allocate_test_Hermitian /// template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { return allocate_test_HeSyMatrix>( ref_matrix, nonuniform_ref, n, params ); @@ -304,31 +259,19 @@ TestMatrix> allocate_test_SymmetricMatrix( // Explicit instantiations. template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_SymmetricMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_SymmetricMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); //------------------------------------------------------------------------------ @@ -353,10 +296,7 @@ TestMatrix>> allocate_test_Symmetric /// template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { // Load params variables slate::Uplo uplo = params.uplo(); @@ -390,28 +330,16 @@ TestMatrix> allocate_test_TriangularMatrix( // Explicit instantiations. 
template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_TriangularMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_TriangularMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index b3db03c62..f92576c84 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -298,31 +298,18 @@ inline void mark_params_for_test_TriangularMatrix(Params& params) template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); #endif // SLATE_MATRIX_UTILS_HH From 88b497e3e86417ef6d1d85868dffe07159b79039 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 14:27:38 -0500 Subject: [PATCH 17/33] Refactor some test/run_tests.py parameters into variables --- test/run_tests.py | 137 ++++++++++++++++++------------------ test/test_svd.cc | 173 +++++++++++++--------------------------------- 2 files changed, 119 insertions(+), 191 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index 1de26268b..dcf5d7324 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -316,6 +316,11 @@ gen_no_nb = origin + target + grid + check + ref + tol + repeat gen_no_target = grid + check + ref + tol + repeat + nb +ge_matrix = ddist + grid_order +sy_matrix = uplo + ddist + grid_order +he_matrix = uplo + ddist + grid_order +tr_matrix = uplo + diag + ddist + grid_order + if (opts.matrix): gen += matrix @@ -350,67 +355,67 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ], - [ 'hemm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + 
matrixBC ], + [ 'hemm', gen + dtype + la + side + he_matrix + mn + ab + matrixBC ], # todo: hemmA GPU support - [ 'hemmA', gen_no_target + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], - [ 'hemmC', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC], + [ 'hemmA', gen_no_target + dtype + la + side + he_matrix + mn + ab + matrixBC ], + [ 'hemmC', gen + dtype + la + side + he_matrix + mn + ab + matrixBC], [ 'hbmm', gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ], - [ 'herk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], - [ 'herk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixC ], + [ 'herk', gen + dtype_real + la + he_matrix + trans + mn + ab + matrixC ], + [ 'herk', gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixC ], - [ 'her2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], - [ 'her2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_real + la + he_matrix + trans + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixBC ], - [ 'symm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], + [ 'symm', gen + dtype + la + side + sy_matrix + mn + ab + matrixBC ], - [ 'syr2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], - [ 'syr2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_real + la + sy_matrix + trans + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixC ], - [ 'syrk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], - [ 'syrk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_real + la + sy_matrix + trans + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixBC ], # todo: tbsm fails for nb=8 or 16 with --quick. 
[ 'tbsm', gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ], - [ 'trmm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trmm', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], - [ 'trsm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], - [ 'trsmA', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], - [ 'trsmB', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trsm', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], + [ 'trsmA', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], + [ 'trsmB', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], ] # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], - [ 'gesv_tntpiv', gen + dtype + la + n + ddist + grid_order ], - [ 'gesv_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + [ 'gesv', gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ], + [ 'gesv_tntpiv', gen + dtype + la + n + ge_matrix ], + [ 'gesv_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], - [ 'getrf_tntpiv', gen + dtype + la + n + ddist + grid_order ], - [ 'getrf_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + [ 'getrf', gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ], + [ 'getrf_tntpiv', gen + dtype + la + n + ge_matrix ], + [ 'getrf_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + ddist + grid_order + nonuniform_nb + thresh ], - [ 'getrs_tntpiv', gen + dtype + la + n + ddist + grid_order ], - [ 'getrs_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + [ 'getrs', gen + dtype + la + n + trans + ge_matrix + nonuniform_nb + thresh ], + [ 'getrs_tntpiv', gen + dtype + la + n + ge_matrix ], + [ 'getrs_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getri', gen + dtype + la + n + ddist + grid_order ], - [ 'getriOOP', gen + dtype + la + n + ddist + grid_order ], + [ 'getri', gen + dtype + la + n + ge_matrix ], + [ 'getriOOP', gen + dtype + la + n + ge_matrix ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n + ddist + grid_order + nonuniform_nb ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ddist + grid_order + nonuniform_nb ], - [ 'gesv_rbt', gen + dtype + la + n + ddist + grid_order ], + [ 'gesv_mixed', gen + dtype_double + la + n + ge_matrix + nonuniform_nb ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ge_matrix + nonuniform_nb ], + [ 'gesv_rbt', gen + dtype + la + n + ge_matrix ], ] # LU banded @@ -426,14 +431,14 @@ def filter_csv( values, csv ): # Cholesky if (opts.chol): cmds += [ - [ 'posv', gen + dtype + la + n + uplo + ddist + grid_order ], - [ 'potrf', gen + dtype + la + n + uplo + ddist + grid_order ], - [ 'potrs', gen + dtype + la + n + uplo + ddist + grid_order ], - [ 'potri', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'posv', gen + dtype + la + n + 
he_matrix ], + [ 'potrf', gen + dtype + la + n + he_matrix ], + [ 'potrs', gen + dtype + la + n + he_matrix ], + [ 'potri', gen + dtype + la + n ], #[ 'porfs', gen + dtype + la + n + uplo ], #[ 'poequ', gen + dtype + la + n ], # only diagonal elements (no uplo) - [ 'posv_mixed', gen + dtype_double + la + n + uplo + ddist + grid_order ], - [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + uplo + ddist + grid_order ], + [ 'posv_mixed', gen + dtype_double + la + n + he_matrix ], + [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + he_matrix ], [ 'trtri', gen + dtype + la + n + uplo + diag ], ] @@ -587,9 +592,9 @@ def filter_csv( values, csv ): # svd if (opts.svd): if ('n' in jobu): - cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' ]] + cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' + ge_matrix ]] if ('v' in jobu): - cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' ]] + cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' + ge_matrix ]] cmds += [ # todo: mn (wide), nb, jobu, jobvt @@ -630,31 +635,31 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], - [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - - [ 'copy', gen + dtype + mn + nonuniform_nb + ddist + grid_order ], - [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + grid_order + uplo ], - [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], - [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], - [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], - - [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], - [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist + grid_order ], - - [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], - [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb + ge_matrix ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + sy_matrix ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + he_matrix ], + + [ 'copy', gen + dtype + mn + nonuniform_nb + ge_matrix ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ge_matrix + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + ge_matrix + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + sy_matrix ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + he_matrix ], + + [ 'scale', gen + dtype + mn 
+ ab + nonuniform_nb + ge_matrix ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + sy_matrix ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + he_matrix ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ge_matrix ], + + [ 'set', gen + dtype + mn + ab + nonuniform_nb + ge_matrix ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + sy_matrix ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + he_matrix ], ] # ------------------------------------------------------------------------------ diff --git a/test/test_svd.cc b/test/test_svd.cc index 2a1fb276b..7189293a1 100644 --- a/test/test_svd.cc +++ b/test/test_svd.cc @@ -8,10 +8,12 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include @@ -35,9 +37,6 @@ void test_svd_work( Params& params, bool run ) lapack::Job jobvt = params.jobvt(); int64_t m = params.dim.m(); int64_t n = params.dim.n(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); - int64_t nb = params.nb(); int64_t ib = params.ib(); int64_t panel_threads = params.panel_threads(); int64_t lookahead = params.lookahead(); @@ -51,6 +50,10 @@ void test_svd_work( Params& params, bool run ) slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // nonuniform nb is not always supported in the reduction to band + params.nonuniform_nb.used( false ); + params.time(); params.ref_time(); params.error2(); @@ -86,6 +89,11 @@ void test_svd_work( Params& params, bool run ) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + if (check && ! ref && (jobu == slate::Job::NoVec || jobvt == slate::Job::NoVec)) { params.msg() = "job = NoVec requires --ref y to check singular values"; @@ -98,48 +106,6 @@ void test_svd_work( Params& params, bool run ) {slate::Option::InnerBlocking, ib} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - int64_t min_mn = std::min(m, n); - - // Figure out local size. - // matrix A (local input), m-by-n - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - std::vector Acpy_data; - - // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; - // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. - int64_t Um = m; - int64_t Un = jobu == slate::Job::AllVec ? m : min_mn; - int64_t VTm = jobvt == slate::Job::AllVec ? 
n : min_mn; - int64_t VTn = n; - - // matrix U (local output), U(m, min_mn), singular values of A - int64_t mlocU = num_local_rows_cols( Um, nb, myrow, p ); - int64_t nlocU = num_local_rows_cols( Un, nb, mycol, q ); - int64_t lldU = blas::max(1, mlocU); // local leading dimension of U - std::vector U_data(1); - - // matrix VT (local output), VT(min_mn, n) - int64_t mlocVT = num_local_rows_cols( VTm, nb, myrow, p ); - int64_t nlocVT = num_local_rows_cols( VTn, nb, mycol, q ); - int64_t lldVT = blas::max(1, mlocVT); // local leading dimension of VT - std::vector VT_data(1); - - // array Sigma (global output), singular values of A - std::vector Sigma(min_mn); - - slate::Matrix A; // (m, n); - slate::Matrix U; // (m, min_mn); - slate::Matrix VT; // (min_mn, n); - slate::Matrix Acpy; - bool wantu = (jobu == slate::Job::Vec || jobu == slate::Job::AllVec || jobu == slate::Job::SomeVec); @@ -147,45 +113,28 @@ void test_svd_work( Params& params, bool run ) || jobvt == slate::Job::AllVec || jobvt == slate::Job::SomeVec); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + int64_t min_mn = std::min(m, n); - Acpy = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - Acpy.insertLocalTiles(origin_target); + // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; + // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. + int64_t Um = wantu ? m : 0; + int64_t Un = wantu ? (jobu == slate::Job::AllVec ? m : min_mn) : 0; + int64_t VTm = wantvt ? (jobvt == slate::Job::AllVec ? n : min_mn) : 0; + int64_t VTn = wantvt ? n : 0; - if (wantu) { - U = slate::Matrix( Um, Un, nb, p, q, MPI_COMM_WORLD ); - U.insertLocalTiles(origin_target); - } - if (wantvt) { - VT = slate::Matrix( VTm, VTn, nb, p, q, MPI_COMM_WORLD ); - VT.insertLocalTiles(origin_target); - } - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + // array Sigma (global output), singular values of A + std::vector Sigma(min_mn); - Acpy_data.resize( lldA * nlocA ); - Acpy = slate::Matrix::fromScaLAPACK( - m, n, &Acpy_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + auto A_alloc = allocate_test_Matrix( check || ref, true, m, n, params ); + auto U_alloc = allocate_test_Matrix( false, true, Um, Un, params ); + auto VT_alloc = allocate_test_Matrix( false, true, VTm, VTn, params ); + // TODO Acpy isn't always needed + auto Acpy_alloc = allocate_test_Matrix( false, true, m, n, params ); - if (wantu) { - U_data.resize(lldU*nlocU); - U = slate::Matrix::fromScaLAPACK( - Um, Un, &U_data[0], lldU, nb, p, q, MPI_COMM_WORLD ); - } - if (wantvt) { - VT_data.resize(lldVT*nlocVT); - VT = slate::Matrix::fromScaLAPACK( - VTm, VTn, &VT_data[0], lldVT, nb, p, q, MPI_COMM_WORLD ); - } - } + auto& A = A_alloc.A; + auto& U = U_alloc.A; + auto& VT = VT_alloc.A; + auto& Acpy = Acpy_alloc.A; if (verbose >= 1) { printf( "%% A %6lld-by-%6lld\n", llong( A.m() ), llong( A.n() ) ); @@ -201,15 +150,10 @@ void test_svd_work( Params& params, bool run ) slate::generate_matrix( params.matrix, A); print_matrix( "A", A, params ); - slate::Matrix Aref; std::vector Sigma_ref; - std::vector Aref_data; if (check || ref) { Sigma_ref.resize( min_mn ); - Aref_data.resize( lldA * nlocA ); - Aref = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, 
MPI_COMM_WORLD ); - slate::copy( A, Aref ); + slate::copy( A, A_alloc.Aref ); slate::copy( A, Acpy ); } @@ -272,8 +216,8 @@ void test_svd_work( Params& params, bool run ) Rm = blas::max( Rm, m ); if (jobvt == slate::Job::AllVec) Rm = blas::max( Rm, n ); - slate::Matrix R( Rm, Rm, nb, p, q, MPI_COMM_WORLD ); - R.insertLocalTiles(); + auto R_alloc = allocate_test_Matrix( false, true, Rm, Rm, params ); + auto R = R_alloc.A; if (wantu) { //================================================== @@ -344,42 +288,21 @@ void test_svd_work( Params& params, bool run ) #ifdef SLATE_HAVE_SCALAPACK // Run reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - blas_int A_desc[9]; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - blas_int U_desc[9]; - scalapack_descinit(U_desc, m, min_mn, nb, nb, 0, 0, ictxt, mlocU, &info); - slate_assert(info == 0); - - blas_int VT_desc[9]; - scalapack_descinit(VT_desc, min_mn, n, nb, nb, 0, 0, ictxt, mlocVT, &info); - slate_assert(info == 0); - - // Allocate U and VT if not already allocated. - // If origin=scalapack, just overwrite SLATE's U and VT. - if (wantu) { - U_data.resize( lldU * nlocU ); - } - if (wantvt) { - VT_data.resize( lldVT * nlocVT ); + blas_int ictxt, A_desc[9], U_desc[9], VT_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + U_alloc.ScaLAPACK_descriptor( ictxt, U_desc ); + VT_alloc.ScaLAPACK_descriptor( ictxt, VT_desc ); + + auto& Aref_data = A_alloc.Aref_data; + auto& U_data = U_alloc.A_data; + auto& VT_data = VT_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK) { + U_data.resize( U_alloc.lld * U_alloc.nloc ); + VT_data.resize( VT_alloc.lld * VT_alloc.nloc ); } // ScaLAPACK uses job = N and V (same as S); @@ -444,7 +367,7 @@ void test_svd_work( Params& params, bool run ) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } From caffee312ad512b9f8a38d16291c8d43d1254d2a Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 15:57:56 -0500 Subject: [PATCH 18/33] FIXUP 9c933fcf5 --- test/run_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index dcf5d7324..f08073217 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -409,8 +409,8 @@ def filter_csv( values, csv ): [ 'getrs_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getri', gen + dtype + la + n + ge_matrix ], - [ 'getriOOP', gen + dtype + la + n + ge_matrix ], + [ 'getri', gen + dtype + la + n ], + [ 'getriOOP', gen + dtype + la + n ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], [ 'gesv_mixed', gen + dtype_double + la + n + ge_matrix + nonuniform_nb ], From 60421d09ed0d0606bece457fab253cf01922e73a Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 20 Dec 2023 18:02:57 -0500 Subject: [PATCH 
19/33] Add options to norm testers --- test/matrix_utils.cc | 103 +++++++++++--- test/matrix_utils.hh | 15 +- test/run_tests.py | 12 +- test/test_genorm.cc | 272 +++++++++++++++++------------------- test/test_henorm.cc | 302 +++++++++++++++++++-------------------- test/test_scale.cc | 1 - test/test_synorm.cc | 302 +++++++++++++++++++-------------------- test/test_trnorm.cc | 326 +++++++++++++++++++------------------------ test/test_utils.hh | 70 +++++++++- 9 files changed, 726 insertions(+), 677 deletions(-) diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc index df9c1022a..0de0e57e9 100644 --- a/test/matrix_utils.cc +++ b/test/matrix_utils.cc @@ -31,18 +31,21 @@ static TestMatrix allocate_test_shared( TestMatrix matrix( m, n, nb, p, q, grid_order ); // Functions for nonuniform tile sizes or row device distribution - nb_func_t tileMb, tileNb; + nb_func_t tileNb; if (nonuniform_nb) { tileNb = [nb](int64_t j) { // for non-uniform tile size return (j % 2 != 0 ? nb*2 : nb); }; - tileMb = tileNb; } else { - tileNb = slate::func::uniform_blocksize( n, nb ); - tileMb = (m == n) ? tileNb : slate::func::uniform_blocksize( m, nb ); + // NB. we let BaseMatrix truncate the final tile length + // TrapezoidMatrix only takes 1 function for both dimensions (of different sizes) + tileNb = [nb](int64_t j) { + return nb; + }; } + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); int num_devices_ = blas::get_device_count(); auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), @@ -53,7 +56,7 @@ static TestMatrix allocate_test_shared( // SLATE allocates CPU or GPU tiles. slate::Target origin_target = origin2target( origin ); if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); + matrix.A = construct_irregular( tileNb, tileRank, tileDevice ); } else { matrix.A = construct_regular( nb, grid_order, p, q ); @@ -72,7 +75,7 @@ static TestMatrix allocate_test_shared( // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); + matrix.Aref = construct_irregular( tileNb, tileRank, tileDevice ); matrix.Aref.insertLocalTiles( slate::Target::Host ); } else { @@ -108,10 +111,10 @@ template TestMatrix> allocate_test_Matrix( bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params) { - auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + auto construct_irregular = [&] (nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) { - return slate::Matrix( m, n, tileMb, tileNb, + return slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); }; auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) @@ -159,7 +162,7 @@ TestMatrix allocate_test_HeSyMatrix( slate::Uplo uplo = params.uplo(); - auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + auto construct_irregular = [&] (nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) { return matrix_type( uplo, n, tileNb, @@ -192,9 +195,6 @@ TestMatrix allocate_test_HeSyMatrix( /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. /// -/// @param[in] m -/// The number of rows -/// /// @param[in] n /// The number of columns /// @@ -238,9 +238,6 @@ TestMatrix>> allocate_test_Hermitian /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. 
/// -/// @param[in] m -/// The number of rows -/// /// @param[in] n /// The number of columns /// @@ -275,7 +272,7 @@ TestMatrix>> allocate_test_Symmetric //------------------------------------------------------------------------------ -/// Allocates a SymmetricMatrix and optionally a reference +/// Allocates a TriangularMatrix and optionally a reference /// version for testing. /// /// @param[in] ref_matrix @@ -285,9 +282,6 @@ TestMatrix>> allocate_test_Symmetric /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. /// -/// @param[in] m -/// The number of rows -/// /// @param[in] n /// The number of columns /// @@ -302,7 +296,7 @@ TestMatrix> allocate_test_TriangularMatrix( slate::Uplo uplo = params.uplo(); slate::Diag diag = params.diag(); - auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + auto construct_irregular = [&] (nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) { return slate::TriangularMatrix( uplo, diag, n, tileNb, @@ -343,3 +337,72 @@ TestMatrix>> allocate_test_Triangula template TestMatrix>> allocate_test_TriangularMatrix>( bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); + +//------------------------------------------------------------------------------ +/// Allocates a TrapezoidMatrix and a reference version for testing. +/// +/// @param ref_matrix[in] +/// Whether to allocate a reference matrix +/// +/// @param nonuniform_ref[in] +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_TrapezoidMatrix( + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params) +{ + // Load params variables + slate::Uplo uplo = params.uplo(); + slate::Diag diag = params.diag(); + + auto construct_irregular = [&] (nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return slate::TrapezoidMatrix( uplo, diag, m, n, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return slate::TrapezoidMatrix( uplo, diag, m, n, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return slate::TrapezoidMatrix::fromScaLAPACK( + uplo, diag, m, n, data, lld, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared>( + ref_matrix, nonuniform_ref, m, n, params, + construct_irregular, construct_regular, construct_scalapack ); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. 
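The "let BaseMatrix truncate the final tile length" comment earlier in this patch is worth one concrete illustration: once the callback returns a constant nb, the matrix dimension, not the callback, bounds the last tile. A standalone sketch of that arithmetic (plain C++; it assumes the truncation behaves like the min below):

    // tile_truncation_demo.cc -- why a constant-nb callback suffices.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        int64_t n = 10, nb = 4;
        auto tileNb = [nb]( int64_t ) { return nb; };  // constant, as in the patch

        int64_t nt = (n + nb - 1) / nb;  // number of tile columns: 3
        for (int64_t j = 0, off = 0; j < nt; ++j) {
            int64_t jb = std::min( tileNb( j ), n - off );  // truncation step
            std::cout << "tile " << j << ": width " << jb << "\n";  // 4, 4, 2
            off += jb;
        }
        return 0;
    }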
+template
+TestMatrix<slate::TrapezoidMatrix<float>> allocate_test_TrapezoidMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<double>> allocate_test_TrapezoidMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<float>>> allocate_test_TrapezoidMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<double>>> allocate_test_TrapezoidMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index f92576c84..247a8ef12 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -288,7 +288,7 @@ inline void mark_params_for_test_SymmetricMatrix(Params& params)
 }
 
 //------------------------------------------------------------------------------
-/// Marks the paramters used by allocate_test_HermitianMatrix
+/// Marks the parameters used by allocate_test_TriangularMatrix
 inline void mark_params_for_test_TriangularMatrix(Params& params)
 {
     params.uplo();
@@ -296,6 +296,15 @@ inline void mark_params_for_test_TriangularMatrix(Params& params)
     mark_params_for_test_Matrix( params );
 }
 
+// -----------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TrapezoidMatrix
+inline void mark_params_for_test_TrapezoidMatrix(Params& params)
+{
+    params.uplo();
+    params.diag();
+    mark_params_for_test_Matrix( params );
+}
+
 template <typename scalar_t>
 TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
     bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
@@ -312,4 +321,8 @@ template <typename scalar_t>
 TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
     bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
 
+template <typename scalar_t>
+TestMatrix<slate::TrapezoidMatrix<scalar_t>> allocate_test_TrapezoidMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
 #endif // SLATE_MATRIX_UTILS_HH
diff --git a/test/run_tests.py b/test/run_tests.py
index f08073217..d0ce7aace 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -298,12 +298,12 @@
 origin = ' --origin ' + opts.origin if (opts.origin) else ''
 target = ' --target ' + opts.target if (opts.target) else ''
 la = ' --lookahead ' + opts.lookahead if (opts.lookahead) else ''
-ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else ''
+ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else ''
 nb = ' --nb ' + opts.nb if (opts.nb) else ''
 nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else ''
 nt = ' --nt ' + opts.nt if (opts.nt) else ''
 grid = ' --grid ' + opts.grid if (opts.grid) else ''
-grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else ''
+grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else ''
 repeat = ' --repeat ' + opts.repeat if (opts.repeat) else ''
 thresh = ' --thresh ' + opts.thresh if (opts.thresh) else ''
 matrix = ' --matrix ' + opts.matrix if (opts.matrix) else ''
@@ -607,10 +607,10 @@ def filter_csv( values, csv ):
     # norms
     if (opts.norms):
         cmds += [
-            [ 'genorm', gen + dtype + mn + norm ],
-            [ 'henorm', gen + dtype + n + norm + uplo ],
-            [ 'synorm', gen + dtype + n + norm + uplo ],
-            [ 'trnorm', gen + dtype + mn + norm + uplo + diag ],
+            [ 'genorm', gen + dtype + mn + norm + nonuniform_nb + ge_matrix ],
+            [ 'henorm', gen + dtype + n + norm + nonuniform_nb + he_matrix ],
+            [ 'synorm', gen + dtype + n + norm + nonuniform_nb + sy_matrix ],
+            [ 'trnorm', gen + dtype + mn + norm + nonuniform_nb + tr_matrix ],
 
             # Banded
             [ 'gbnorm', gen + dtype + mn + 
kl + ku + norm ], diff --git a/test/test_genorm.cc b/test/test_genorm.cc index 5437d9429..31f7683df 100644 --- a/test/test_genorm.cc +++ b/test/test_genorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -39,12 +41,17 @@ void test_genorm_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -52,41 +59,26 @@ void test_genorm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); + auto A_alloc = allocate_test_Matrix( ref_copy, false, m, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (check || ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - } + slate::generate_matrix(params.matrix, A); - slate::Matrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. 
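
(For context on the boilerplate deleted here: num_local_rows_cols computes the standard 2D block-cyclic local extent, like ScaLAPACK's numroc. A self-contained sketch of that computation, assuming the usual zero-based cyclic distribution; the helper name and the sizes in main are illustrative, not tester code.)

    #include <cstdint>
    #include <cstdio>

    // Rows owned by process row `myrow` out of `p`, for an m-row matrix
    // distributed block-cyclically with block size nb.
    int64_t local_rows_cols( int64_t m, int64_t nb, int64_t myrow, int64_t p )
    {
        int64_t nblocks = m / nb;            // full blocks in the global dimension
        int64_t rows = (nblocks / p) * nb;   // full cycles owned by every process
        int64_t extra = nblocks % p;         // ranks holding one extra full block
        if (myrow < extra)
            rows += nb;
        else if (myrow == extra)
            rows += m % nb;                  // the ragged final block, if any
        return rows;
    }

    int main()
    {
        // 10 rows, nb = 4, p = 2: blocks of 4, 4, 2 deal out as 4+2 and 4.
        std::printf( "%lld %lld\n",
                     (long long) local_rows_cols( 10, 4, 0, 2 ),
                     (long long) local_rows_cols( 10, 4, 1, 2 ) );
        return 0;
    }
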
- A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix(params.matrix, A); - std::vector values; if (scope == slate::NormScope::Columns) { values.resize(A.n()); @@ -136,33 +128,21 @@ void test_genorm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK) { + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + copy(A, &A_data[0], A_desc); } // allocate work space - std::vector worklange(std::max(mlocA, nlocA)); + std::vector worklange(std::max(A_alloc.mloc, A_alloc.nloc)); // (Sca)LAPACK norms don't support trans; map One <=> Inf norm. slate::Norm op_norm = norm; @@ -199,10 +179,6 @@ void test_genorm_work(Params& params, bool run) } time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lange( - // op_norm, - // m, n, &A_data[0], lldA); - if (scope == slate::NormScope::Matrix) { // difference between norms error = std::abs(A_norm - A_norm_ref) / A_norm_ref; @@ -216,7 +192,7 @@ void test_genorm_work(Params& params, bool run) error /= sqrt(m*n); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -238,107 +214,113 @@ void test_genorm_work(Params& params, bool run) //---------- extended tests if (extended && scope == slate::NormScope::Matrix) { - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops. - int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - if (i < 0 || i >= mt) + else { + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. 
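
(The extended tests being re-indented here all follow a save/poke/restore pattern. A scalar analogue in plain C++, with made-up sizes, showing the idea without SLATE: inject a known peak, verify the recomputed norm picks it up, then restore the entry so later checks see clean data.)

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    double max_norm( std::vector<double> const& a )
    {
        double v = 0;
        for (double x : a)
            v = std::max( v, std::fabs( x ) );
        return v;
    }

    int main()
    {
        std::vector<double> a( 100, 1.0 );
        double peak = 1e6;
        double save = a[ 42 ];       // save the entry
        a[ 42 ] = peak;              // poke the peak
        bool okay = (max_norm( a ) == peak);
        a[ 42 ] = save;              // restore for any later checks
        std::printf( "%s\n", okay ? "pass" : "failed" );
        return 0;
    }
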
+ int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib) { + for (auto i : i_indices) { + if (i < 0 || i >= mt) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the genorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - A_norm_ref = scalapack_plange( - norm2str(norm), m, n, - &A_data[0], 1, 1, A_desc, - &worklange[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + A_norm_ref = scalapack_plange( + norm2str(norm), m, n, + &A_data[0], 1, 1, A_desc, + &worklange[0]); - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! 
okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the genorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -351,7 +333,7 @@ void test_genorm_work(Params& params, bool run) SLATE_UNUSED( A_norm ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_henorm.cc b/test/test_henorm.cc index 08b5fd4bc..d18a38792 100644 --- a/test/test_henorm.cc +++ b/test/test_henorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -36,12 +38,17 @@ void test_henorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_HermitianMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -51,44 +58,26 @@ void test_henorm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. 
- int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto A_alloc = allocate_test_HermitianMatrix( ref_copy, false, n, params ); - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::HermitianMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::HermitianMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::HermitianMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix( params.matrix, A ); - print_matrix("A", A, params); if (trace) slate::trace::Trace::on(); @@ -110,38 +99,28 @@ void test_henorm_work(Params& params, bool run) params.time() = time; #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + // comparison with reference routine from ScaLAPACK // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? 
A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); } if (check || ref) { // comparison with reference routine from ScaLAPACK // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklanhe( lwork ); //================================================== @@ -153,10 +132,6 @@ void test_henorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lanhe( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -166,7 +141,7 @@ void test_henorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -188,119 +163,124 @@ void test_henorm_work(Params& params, bool run) //---------- extended tests if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklanhe(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. - int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // allocate work space + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), + scalapack_ilcm( p, q ) / p ); + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; + std::vector worklanhe(lwork); + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. 
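
(To make the lanhe workspace bound above concrete, a worked instance with made-up grid values; ceildiv and the ilcm factor mirror the tester's formula, nothing else is real.)

    #include <cstdint>
    #include <cstdio>
    #include <numeric>   // std::lcm

    int64_t ceildiv( int64_t a, int64_t b ) { return (a + b - 1) / b; }

    int main()
    {
        int64_t p = 2, q = 3, nb = 100;
        int64_t mloc = 500, nloc = 400;      // assumed local matrix extents
        int64_t ilcm = std::lcm( p, q );     // ScaLAPACK's ilcm(p, q) = 6
        int64_t ldw = nb * ceildiv( ceildiv( nloc, nb ), ilcm / p );
        int64_t lwork = 2*mloc + nloc + ldw; // = 1000 + 400 + 200 = 1600
        std::printf( "ldw %lld, lwork %lld\n",
                     (long long) ldw, (long long) lwork );
        return 0;
    }
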
- std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - A_norm = slate::norm(norm, A, opts); + A_norm = slate::norm(norm, A, opts); - real_t A_norm_ref = scalapack_planhe( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); + real_t A_norm_ref = scalapack_planhe( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + // difference between norms + real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. 
- bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // Allow for difference, except max norm in real should be exact. + real_t eps = std::numeric_limits::epsilon(); + real_t tol; + if (norm == slate::Norm::Max && ! slate::is_complex::value) + tol = 0; + else + tol = 10*eps; + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -315,7 +295,7 @@ void test_henorm_work(Params& params, bool run) SLATE_UNUSED( ref ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) + if ((check || ref) && A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale.cc b/test/test_scale.cc index d6a71c5af..0d05b056d 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_synorm.cc b/test/test_synorm.cc index 0976e59c5..1145006fe 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -36,12 +38,17 @@ void test_synorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_SymmetricMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -49,44 +56,26 @@ void test_synorm_work(Params& params, bool run) if (! 
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); + auto A_alloc = allocate_test_SymmetricMatrix( ref_copy, false, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::SymmetricMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::SymmetricMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::SymmetricMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix(params.matrix, A); - print_matrix("A", A, params); if (trace) slate::trace::Trace::on(); @@ -108,38 +97,28 @@ void test_synorm_work(Params& params, bool run) params.time() = time; #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + // comparison with reference routine from ScaLAPACK // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? 
A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); } if (check || ref) { // comparison with reference routine from ScaLAPACK // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklansy(lwork); //================================================== @@ -151,10 +130,6 @@ void test_synorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklansy[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lansy( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -164,7 +139,7 @@ void test_synorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -186,119 +161,124 @@ void test_synorm_work(Params& params, bool run) //---------- extended tests if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklansy(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. - int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // allocate work space + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), + scalapack_ilcm( p, q ) / p ); + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; + std::vector worklansy(lwork); + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. 
- std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plansy( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklansy[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + real_t A_norm_ref = scalapack_plansy( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklansy[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! 
okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real( peak ), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + // Allow for difference, except max norm in real should be exact. + real_t eps = std::numeric_limits::epsilon(); + real_t tol; + if (norm == slate::Norm::Max && ! slate::is_complex::value) + tol = 0; + else + tol = 10*eps; + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real( peak ), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -313,7 +293,7 @@ void test_synorm_work(Params& params, bool run) SLATE_UNUSED( ref ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) + if ((check || ref) && A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 8262e6ac9..4cd1a4cbc 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -38,12 +40,17 @@ void test_trnorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_TrapezoidMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -51,55 +58,26 @@ void test_trnorm_work(Params& params, bool run) if (! 
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // upper requires m <= n, - // lower requires m >= n - if ((uplo == slate::Uplo::Lower && m < n) || - (uplo == slate::Uplo::Upper && m > n)) { - char buf[255]; - snprintf( buf, sizeof( buf ), "skipping invalid size: %s, %lld-by-%lld", - uplo2str( uplo ), llong( m ), llong( n ) ); - params.msg() = buf; - return; - } - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto A_alloc = allocate_test_TrapezoidMatrix( ref_copy, false, m, n, params ); - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::TrapezoidMatrix A0(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::TrapezoidMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::TrapezoidMatrix(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::TrapezoidMatrix::fromScaLAPACK( - uplo, diag, m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix( params.matrix, A ); - print_matrix("A", A, params); if (trace) @@ -124,38 +102,27 @@ void test_trnorm_work(Params& params, bool run) params.time() = time; #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + // comparison with reference routine from ScaLAPACK // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - if (info != 0) - printf( "scalapack_descinit info %lld\n", llong( info ) ); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? 
A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); } + // TODO move the above into this if statement if (check || ref) { // comparison with reference routine from ScaLAPACK // allocate work space - std::vector worklantr(std::max(mlocA, nlocA)); + std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); //================================================== // Run ScaLAPACK reference routine. @@ -166,10 +133,6 @@ void test_trnorm_work(Params& params, bool run) m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lantr( - // norm, A.uplo(), diag, - // m, n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One) { @@ -182,7 +145,7 @@ void test_trnorm_work(Params& params, bool run) error /= sqrt(m*n); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -204,125 +167,130 @@ void test_trnorm_work(Params& params, bool run) //---------- extended tests if (extended) { - // allocate work space - std::vector worklantr(std::max(mlocA, nlocA)); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops. - int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // allocate work space + std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. 
- std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : i_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - slate_assert(A_data[ ilocal + jlocal*lldA ] == save); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plantr( - norm2str(norm), uplo2str(A.uplo()), diag2str(diag), - m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + real_t A_norm_ref = scalapack_plantr( + norm2str(norm), uplo2str(A.uplo()), diag2str(diag), + m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! 
slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan, - // except in Unit case with i == j and ii == jj, - // where peak shouldn't affect A_norm. - bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); + } + + // Allow for difference, except max norm in real should be exact. + real_t eps = std::numeric_limits::epsilon(); + real_t tol; + if (norm == slate::Norm::Max && ! slate::is_complex::value) + tol = 0; + else + tol = 10*eps; + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan, + // except in Unit case with i == j and ii == jj, + // where peak shouldn't affect A_norm. + bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -338,7 +306,7 @@ void test_trnorm_work(Params& params, bool run) SLATE_UNUSED( ref ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_utils.hh b/test/test_utils.hh index c16d334a7..f04ecb4b8 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -13,7 +13,7 @@ /// /// @return true if the configuration should be skipped /// -inline bool is_invalid_parameters(Params& params) +inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = false) { slate::Origin origin = params.origin(); slate::Target target = params.target(); @@ -36,7 +36,7 @@ inline bool is_invalid_parameters(Params& params) } #ifdef SLATE_HAVE_SCALAPACK - if (nonuniform_nb && params.ref()) { + if (!keep_nonuniform_ref && nonuniform_nb && params.ref()) { params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; params.ref() = false; } @@ -128,7 +128,71 @@ void matrix_iterator( } } else { - assert( false ); + bool is_upper = (A.uplo() == slate::Uplo::Upper); + + int64_t A_j = 0, A_jj = 0; + for (int64_t B_j = 0; B_j < B_nt; ++B_j) { + + int64_t A_i = 0, A_ii = 0; + for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + + bool right_uplo = is_upper ? (B_i <= B_j) : (B_i >= B_j); + if (right_uplo) { + + #pragma omp task shared(A, B) firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) + { + int tag = A_i + A_j * A.mt(); + if (B.tileIsLocal( B_i, B_j )) { + A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), + slate::Layout::ColMajor, tag ); + + A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); + B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); + auto TA = A( A_i, A_j ); + auto TB = B( B_i, B_j ); + int64_t mb = TB.mb(); + int64_t nb = TB.nb(); + assert( A_ii + mb <= TA.mb() ); + assert( A_jj + nb <= TA.nb() ); + int64_t lda = TA.stride(); + int64_t ldb = TB.stride(); + scalar_t const* TA_data = TA.data(); + scalar_t* TB_data = TB.data(); + + for (int64_t jj = 0; jj < nb; ++jj) { + int64_t ii_start = 0, ii_end = mb; + if (B_i == B_j) { // diagonal tile + if (is_upper) + ii_end = std::min(jj+1, mb); + else + ii_start = jj; + } + for (int64_t ii = ii_start; ii < ii_end; ++ii) { + thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], + TB_data[ ii + jj*ldb ] ); + } + } + } + else if (A.tileIsLocal( A_i, A_j )) { + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); + } + } + } + + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; + } + } + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; + } + } } A.releaseRemoteWorkspace(); From 6b7636b2a1676d5880a4c25c4dcd49967659ebec Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Thu, 21 Dec 2023 11:46:11 -0500 Subject: [PATCH 20/33] Fix bug in henorm and synorm for non-uniform tile sizes --- src/internal/internal_henorm.cc | 25 +++++++++++++------------ src/internal/internal_synorm.cc | 25 +++++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc index 7b4eecd14..36a0fe925 100644 --- a/src/internal/internal_henorm.cc +++ b/src/internal/internal_henorm.cc @@ -196,10 +196,11 @@ void norm( // Sum tile results into local results. // Summing up local contributions only. 
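
(The bug fixed in this hunk is easiest to see in isolation: with nonuniform tiles, tile j's first column is the sum of the widths before it, not j*nb0. A toy, SLATE-free illustration of the difference:)

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int64_t> widths = { 2, 4, 2, 4 };  // nonuniform tile widths
        int64_t nb0 = widths[ 0 ];

        int64_t jj = 0;   // running first-column offset, as in the fix
        for (size_t j = 0; j < widths.size(); ++j) {
            std::printf( "tile %zu: wrong offset %lld, correct offset %lld\n",
                         j, (long long) (j * nb0), (long long) jj );
            jj += widths[ j ];
        }
        return 0;
    }

The j*nb0 indexing gives 0, 2, 4, 6 where the correct offsets are 0, 2, 6, 8; the two agree only when every tile has width nb0.
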
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
                 int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
@@ -210,27 +211,27 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += A.tileMb(i);
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
+            // diagonal blocks
             int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
     //---------
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 9f814b898..9a0b900cc 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -195,10 +195,11 @@ void norm(
         // Sum tile results into local results.
         // Summing up local contributions only.
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
                 int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
@@ -209,27 +210,27 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += A.tileMb(i);
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
+            // diagonal blocks
             int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
     //---------

From 5be01cfeac1852a5efb149ecb26b794149c610af Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Thu, 21 Dec 2023 11:46:42 -0500
Subject: [PATCH 21/33] Reduce code duplication in matrix_iterator

---
 test/test_utils.hh | 111 ++++++++++++---------------------------------
 1 file changed, 28 insertions(+), 83 deletions(-)

diff --git a/test/test_utils.hh b/test/test_utils.hh
index f04ecb4b8..b4fbba6db 100644
--- a/test/test_utils.hh
+++ b/test/test_utils.hh
@@ -73,13 +73,18 @@ void matrix_iterator(
     int64_t B_mt = B.mt();
     int64_t B_nt = B.nt();
 
-    if constexpr (std::is_same>::value) {
-        int64_t A_j = 0, A_jj = 0;
-        for (int64_t B_j = 0; B_j < B_nt; ++B_j) {
-
-            int64_t A_i = 0, A_ii = 0;
-            for (int64_t B_i = 0; B_i < B_mt; ++B_i) {
+    constexpr bool is_general = std::is_same_v>;
+    assert( is_general == (A.uplo() == slate::Uplo::General) );
+    bool is_upper = (A.uplo() == slate::Uplo::Upper);
+
+    int64_t A_j = 0, A_jj = 0;
+    for (int64_t B_j = 0; B_j < B_nt; ++B_j) {
+
+        int64_t A_i = 0, A_ii = 0;
+        for (int64_t B_i = 0; B_i < B_mt; ++B_i) {
+
+            if (is_general || (is_upper ? 
(B_i <= B_j) : (B_i >= B_j))) { #pragma omp task shared(A, B) \ firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) { @@ -101,7 +106,14 @@ void matrix_iterator( scalar_t const* TA_data = TA.data(); scalar_t* TB_data = TB.data(); for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { + int64_t ii_start = 0, ii_end = mb; + if (!is_general && B_i == B_j) { // diagonal tile + if (is_upper) + ii_end = std::min(jj+1, mb); + else + ii_start = jj; + } + for (int64_t ii = ii_start; ii < ii_end; ++ii) { thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], TB_data[ ii + jj*ldb ] ); } @@ -112,86 +124,19 @@ void matrix_iterator( } } - A_ii += B.tileMb( B_i ); - assert( A_ii <= A.tileMb( A_i ) ); - if (A_ii == A.tileMb( A_i )) { - ++A_i; - A_ii = 0; - } } - A_jj += B.tileNb( B_j ); - assert( A_jj <= A.tileNb( A_j ) ); - if (A_jj == A.tileNb( A_j )) { - ++A_j; - A_jj = 0; + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; } } - } - else { - bool is_upper = (A.uplo() == slate::Uplo::Upper); - - int64_t A_j = 0, A_jj = 0; - for (int64_t B_j = 0; B_j < B_nt; ++B_j) { - - int64_t A_i = 0, A_ii = 0; - for (int64_t B_i = 0; B_i < B_mt; ++B_i) { - - bool right_uplo = is_upper ? (B_i <= B_j) : (B_i >= B_j); - if (right_uplo) { - - #pragma omp task shared(A, B) firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) - { - int tag = A_i + A_j * A.mt(); - if (B.tileIsLocal( B_i, B_j )) { - A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), - slate::Layout::ColMajor, tag ); - - A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); - B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); - auto TA = A( A_i, A_j ); - auto TB = B( B_i, B_j ); - int64_t mb = TB.mb(); - int64_t nb = TB.nb(); - assert( A_ii + mb <= TA.mb() ); - assert( A_jj + nb <= TA.nb() ); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - - for (int64_t jj = 0; jj < nb; ++jj) { - int64_t ii_start = 0, ii_end = mb; - if (B_i == B_j) { // diagonal tile - if (is_upper) - ii_end = std::min(jj+1, mb); - else - ii_start = jj; - } - for (int64_t ii = ii_start; ii < ii_end; ++ii) { - thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], - TB_data[ ii + jj*ldb ] ); - } - } - } - else if (A.tileIsLocal( A_i, A_j )) { - A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); - } - } - } - - A_ii += B.tileMb( B_i ); - assert( A_ii <= A.tileMb( A_i ) ); - if (A_ii == A.tileMb( A_i )) { - ++A_i; - A_ii = 0; - } - } - A_jj += B.tileNb( B_j ); - assert( A_jj <= A.tileNb( A_j ) ); - if (A_jj == A.tileNb( A_j )) { - ++A_j; - A_jj = 0; - } + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; } } From 31d84c3db165a049a59df693c267ad5cf634c2fc Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 26 Dec 2023 16:18:56 -0500 Subject: [PATCH 22/33] Improve some warning messages --- test/test_utils.hh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/test_utils.hh b/test/test_utils.hh index b4fbba6db..ab1611665 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -26,19 +26,25 @@ inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = fal } if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; + params.msg() = "skipping: dev_dist = Col tile not supported with origin=ScaLAPACK"; return true; } if 
(nonuniform_nb && origin == slate::Origin::ScaLAPACK) {
-        params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK";
+        params.msg() = "skipping: nonuniform tile not supported with origin=ScaLAPACK";
         return true;
     }
 
 #ifdef SLATE_HAVE_SCALAPACK
-    if (!keep_nonuniform_ref && nonuniform_nb && params.ref()) {
+    if (!keep_nonuniform_ref && nonuniform_nb && params.ref() != 'n') {
         params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK";
-        params.ref() = false;
+        if (params.ref() == 'o') {
+            // If ref == 'o', the user doesn't want to run the SLATE version
+            return true;
+        }
+        else {
+            params.ref() = 'n';
+        }
     }
 #else
     // Can only run ref when we have ScaLAPACK

From 8e00a465f840025cd9f9bdd00b75333b2d57a14b Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Wed, 27 Dec 2023 14:10:10 -0500
Subject: [PATCH 23/33] Delete unused header

---
 test/scalapack_support_routines.hh | 191 -----------------------------
 test/test_add.cc                   |   1 -
 test/test_bdsqr.cc                 |   1 -
 test/test_copy.cc                  |   1 -
 test/test_gbnorm.cc                |   1 -
 test/test_gecondest.cc             |   1 -
 test/test_gelqf.cc                 |   1 -
 test/test_gels.cc                  |   1 -
 test/test_gemm.cc                  |   1 -
 test/test_geqrf.cc                 |   1 -
 test/test_gesv.cc                  |   1 -
 test/test_getri.cc                 |   1 -
 test/test_hb2st.cc                 |   1 -
 test/test_hbnorm.cc                |   1 -
 test/test_heev.cc                  |   1 -
 test/test_hegst.cc                 |   1 -
 test/test_hegv.cc                  |   1 -
 test/test_hesv.cc                  |   1 -
 test/test_pocondest.cc             |   1 -
 test/test_potri.cc                 |   1 -
 test/test_scale_row_col.cc         |   1 -
 test/test_set.cc                   |   1 -
 test/test_stedc.cc                 |   1 -
 test/test_stedc_deflate.cc         |   1 -
 test/test_stedc_secular.cc         |   1 -
 test/test_stedc_sort.cc            |   1 -
 test/test_stedc_z_vector.cc        |   1 -
 test/test_steqr2.cc                |   1 -
 test/test_sterf.cc                 |   2 +-
 test/test_tb2bd.cc                 |   1 -
 test/test_tbsm.cc                  |   1 -
 test/test_trcondest.cc             |   1 -
 test/test_trtri.cc                 |   1 -
 test/test_unmqr.cc                 |   1 -
 34 files changed, 1 insertion(+), 224 deletions(-)
 delete mode 100644 test/scalapack_support_routines.hh

diff --git a/test/scalapack_support_routines.hh b/test/scalapack_support_routines.hh
deleted file mode 100644
index 458dc7e42..000000000
--- a/test/scalapack_support_routines.hh
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright (c) 2009-2023, University of Tennessee. All rights reserved.
-// Copyright (c) 2010, University of Denver, Colorado.
-// SPDX-License-Identifier: BSD-3-Clause
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
- -#ifndef SLATE_SCALAPACK_SUPPORT_HH -#define SLATE_SCALAPACK_SUPPORT_HH - -#include - -#include "scalapack_wrappers.hh" - -//------------------------------------------------------------------------------ -// Matrix generation -#define Rnd64_A 6364136223846793005ULL -#define Rnd64_C 1ULL -#define RndF_Mul 5.4210108624275222e-20f -#define RndD_Mul 5.4210108624275222e-20 - -typedef unsigned long long ull; - -static inline ull Rnd64_jump(ull n, ull seed) -{ - ull a_k, c_k, ran; - int64_t i; - - a_k = Rnd64_A; - c_k = Rnd64_C; - ran = seed; - for (i = 0; n; n >>= 1, ++i) { - if (n & 1) - ran = a_k*ran + c_k; - c_k *= (a_k + 1); - a_k *= a_k; - } - return ran; -} - -template -static inline void CORE_plrnt(int64_t m, int64_t n, scalar_t* A, int64_t lda, - int64_t bigM, int64_t m0, int64_t n0, ull seed) -{ - scalar_t* tmp = A; - int64_t i, j; - ull ran, jump; - - jump = (ull)m0 + (ull)n0*(ull)bigM; - for (j = 0; j < n; ++j) { - ran = Rnd64_jump(jump, (ull)seed); - for (i = 0; i < m; ++i) { - *tmp = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - ++tmp; - } - tmp += lda - i; - jump += bigM; - } -} - -template -static inline void CORE_plghe(scalar_t bump, int64_t m, int64_t n, scalar_t* A, int64_t lda, - int64_t gM, int64_t m0, int64_t n0, ull seed) -{ - scalar_t* tmp = A; - int64_t i, j; - ull ran, jump; - - jump = (ull)m0 + (ull)n0*(ull)gM; - /* Tile diagonal */ - if (m0 == n0) { - for (j = 0; j < n; ++j) { - ran = Rnd64_jump(jump, seed); - - for (i = j; i < m; ++i) { - *tmp = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - ++tmp; - } - tmp += (lda - i + j + 1); - jump += gM + 1; - } - for (j = 0; j < n; ++j) { - A[j + j*lda] += bump; - - for (i = 0; i < j; ++i) - A[lda*j + i] = A[lda*i + j]; - } - } - - /* Lower part */ - else if (m0 > n0) { - for (j = 0; j < n; ++j) { - ran = Rnd64_jump(jump, seed); - - for (i = 0; i < m; ++i) { - *tmp = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - ++tmp; - } - tmp += (lda - i); - jump += gM; - } - } - - /* Upper part */ - else if (m0 < n0) { - /* Overwrite jump */ - jump = (ull)n0 + (ull)m0*(ull)gM; - - for (i = 0; i < m; ++i) { - ran = Rnd64_jump(jump, seed); - - for (j = 0; j < n; ++j) { - A[j*lda + i] = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - } - jump += gM; - } - } -} - -template -static void scalapack_pplrnt(scalar_t* A, - int64_t m, int64_t n, - int64_t mb, int64_t nb, - blas_int myrow, blas_int mycol, - blas_int nprow, blas_int npcol, - int64_t lldA, - int64_t seed) -{ - blas_int idum1, idum2, iloc, jloc, i0 = 0; - blas_int tempm, tempn; - scalar_t* Ab; - blas_int mb_ = blas_int( mb ); - blas_int nb_ = blas_int( nb ); - - // #pragma omp parallel for - for (blas_int i = 1; i <= m; i += mb) { - for (blas_int j = 1; j <= n; j += nb) { - if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) && - (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) { - iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow); - jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol); - - Ab = &A[(jloc - 1)*lldA + (iloc - 1) ]; - tempm = (m - i + 1) > mb ? mb : (m - i + 1); - tempn = (n - j + 1) > nb ? 
nb : (n - j + 1); - CORE_plrnt(tempm, tempn, Ab, lldA, - m, (i - 1), (j - 1), seed); - } - } - } -} - - -template -static void scalapack_pplghe(scalar_t* A, - int64_t m, int64_t n, - int64_t mb, int64_t nb, - blas_int myrow, blas_int mycol, - blas_int nprow, blas_int npcol, - int64_t lldA, - int64_t seed) -{ - blas_int idum1, idum2, iloc, jloc, i0 = 0; - int64_t tempm, tempn; - scalar_t* Ab; - scalar_t bump = (scalar_t)m; - blas_int mb_ = blas_int( mb ); - blas_int nb_ = blas_int( nb ); - - // #pragma omp parallel for - for (blas_int i = 1; i <= m; i += mb) { - for (blas_int j = 1; j <= n; j += nb) { - if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) && - (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) { - iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow); - jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol); - - Ab = &A[(jloc - 1)*lldA + (iloc - 1) ]; - tempm = (m - i + 1) > mb ? mb : (m - i + 1); - tempn = (n - j + 1) > nb ? nb : (n - j + 1); - CORE_plghe(bump, tempm, tempn, Ab, lldA, - m, (i - 1), (j - 1), seed); - } - } - } -} - -#endif // SLATE_SCALAPACK_SUPPORT_HH diff --git a/test/test_add.cc b/test/test_add.cc index b6fa65700..0d56f8268 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_bdsqr.cc b/test/test_bdsqr.cc index 8e4ac041f..b3ffa96f0 100644 --- a/test/test_bdsqr.cc +++ b/test/test_bdsqr.cc @@ -7,7 +7,6 @@ #include "blas.hh" #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "internal/internal.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_copy.cc b/test/test_copy.cc index f8aa9bd1c..1e8dfb99a 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_gbnorm.cc b/test/test_gbnorm.cc index 883d07aff..7ac9a510d 100644 --- a/test/test_gbnorm.cc +++ b/test/test_gbnorm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "print_matrix.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_gecondest.cc b/test/test_gecondest.cc index 9bb9badad..8f365bee8 100644 --- a/test/test_gecondest.cc +++ b/test/test_gecondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gelqf.cc b/test/test_gelqf.cc index 915d67455..68a84747c 100644 --- a/test/test_gelqf.cc +++ b/test/test_gelqf.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gels.cc b/test/test_gels.cc index 9ad6a3bbc..2d20e213a 100644 --- a/test/test_gels.cc +++ b/test/test_gels.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gemm.cc b/test/test_gemm.cc index c20db8ea0..8add9e9c5 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -13,7 +13,6 @@ #include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include 
"scalapack_copy.hh" #include diff --git a/test/test_geqrf.cc b/test/test_geqrf.cc index 6ae7d8302..18c8bf114 100644 --- a/test/test_geqrf.cc +++ b/test/test_geqrf.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 422dc686a..17c7672f8 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -14,7 +14,6 @@ #include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_getri.cc b/test/test_getri.cc index 6a0343e52..3693dcba1 100644 --- a/test/test_getri.cc +++ b/test/test_getri.cc @@ -10,7 +10,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hb2st.cc b/test/test_hb2st.cc index 5c3f4bce7..607250fe1 100644 --- a/test/test_hb2st.cc +++ b/test/test_hb2st.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_hbnorm.cc b/test/test_hbnorm.cc index c62de133a..bf5c5cd2c 100644 --- a/test/test_hbnorm.cc +++ b/test/test_hbnorm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "band_utils.hh" diff --git a/test/test_heev.cc b/test/test_heev.cc index 505c1c0f3..fc7077f2a 100644 --- a/test/test_heev.cc +++ b/test/test_heev.cc @@ -9,7 +9,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hegst.cc b/test/test_hegst.cc index 93d78d275..35574087d 100644 --- a/test/test_hegst.cc +++ b/test/test_hegst.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "auxiliary/Debug.hh" #include "grid_utils.hh" diff --git a/test/test_hegv.cc b/test/test_hegv.cc index d342be443..f6061b0a1 100644 --- a/test/test_hegv.cc +++ b/test/test_hegv.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hesv.cc b/test/test_hesv.cc index 2e8f54af1..8dbe4b83c 100644 --- a/test/test_hesv.cc +++ b/test/test_hesv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" #include "print_matrix.hh" diff --git a/test/test_pocondest.cc b/test/test_pocondest.cc index 47d45109e..081e03e29 100644 --- a/test/test_pocondest.cc +++ b/test/test_pocondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_potri.cc b/test/test_potri.cc index 647181acd..07d926ffc 100644 --- a/test/test_potri.cc +++ b/test/test_potri.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index e239beb0d..e538476ee 100644 --- a/test/test_scale_row_col.cc +++ 
b/test/test_scale_row_col.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_set.cc b/test/test_set.cc index 5e1fdf024..9b91e7583 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_stedc.cc b/test/test_stedc.cc index b28271c10..73ff18ec9 100644 --- a/test/test_stedc.cc +++ b/test/test_stedc.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" #include "matrix_generator.hh" diff --git a/test/test_stedc_deflate.cc b/test/test_stedc_deflate.cc index 9ad2dedd8..077710af2 100644 --- a/test/test_stedc_deflate.cc +++ b/test/test_stedc_deflate.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_secular.cc b/test/test_stedc_secular.cc index 2e5aeabaf..e321bb8cd 100644 --- a/test/test_stedc_secular.cc +++ b/test/test_stedc_secular.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_sort.cc b/test/test_stedc_sort.cc index 96aa1d451..26eff3fd7 100644 --- a/test/test_stedc_sort.cc +++ b/test/test_stedc_sort.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_z_vector.cc b/test/test_stedc_z_vector.cc index bc8a5013f..64ca668a2 100644 --- a/test/test_stedc_z_vector.cc +++ b/test/test_stedc_z_vector.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_steqr2.cc b/test/test_steqr2.cc index 4d092669e..ff5ec29b9 100644 --- a/test/test_steqr2.cc +++ b/test/test_steqr2.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_sterf.cc b/test/test_sterf.cc index 1d0a7f2a5..5488ef7fe 100644 --- a/test/test_sterf.cc +++ b/test/test_sterf.cc @@ -7,7 +7,7 @@ #include "blas.hh" #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" + #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_tb2bd.cc b/test/test_tb2bd.cc index a1ab1aafa..29204bda2 100644 --- a/test/test_tb2bd.cc +++ b/test/test_tb2bd.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_tbsm.cc b/test/test_tbsm.cc index 2442245b4..b7204bb90 100644 --- a/test/test_tbsm.cc +++ b/test/test_tbsm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "print_matrix.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_trcondest.cc b/test/test_trcondest.cc index 74fbac410..1301854fb 100644 --- a/test/test_trcondest.cc +++ b/test/test_trcondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include 
"scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_trtri.cc b/test/test_trtri.cc index 2cd7f6daf..e0c0d9435 100644 --- a/test/test_trtri.cc +++ b/test/test_trtri.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "grid_utils.hh" diff --git a/test/test_unmqr.cc b/test/test_unmqr.cc index d7270b303..87eff3ecf 100644 --- a/test/test_unmqr.cc +++ b/test/test_unmqr.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include From faddc11a772bed6879bdfdfcd9b3af4cdde18437 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 14:22:35 -0500 Subject: [PATCH 24/33] Cleanup some header include's --- test/test_add.cc | 1 - test/test_copy.cc | 1 - test/test_genorm.cc | 1 - test/test_gesv.cc | 1 - test/test_hemm.cc | 1 - test/test_henorm.cc | 1 - test/test_her2k.cc | 1 - test/test_herk.cc | 1 - test/test_posv.cc | 1 - test/test_scale.cc | 1 - test/test_scale_row_col.cc | 1 - test/test_set.cc | 1 - test/test_svd.cc | 1 - test/test_symm.cc | 1 - test/test_synorm.cc | 1 - test/test_syr2k.cc | 1 - test/test_syrk.cc | 1 - test/test_trmm.cc | 1 - test/test_trnorm.cc | 1 - test/test_trsm.cc | 1 - test/test_utils.hh | 1 + 21 files changed, 1 insertion(+), 20 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index 0d56f8268..0596c73f8 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_copy.cc b/test/test_copy.cc index 1e8dfb99a..d7b3bf0fc 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_genorm.cc b/test/test_genorm.cc index 31f7683df..41c03cee5 100644 --- a/test/test_genorm.cc +++ b/test/test_genorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 17c7672f8..e8d7b96e5 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_hemm.cc b/test/test_hemm.cc index a31612b53..d874c40ac 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_henorm.cc b/test/test_henorm.cc index d18a38792..e03a6ac30 100644 --- a/test/test_henorm.cc +++ b/test/test_henorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_her2k.cc b/test/test_her2k.cc index fc8d4077b..7fe3ed067 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -8,7 +8,6 @@ #include "print_matrix.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_herk.cc b/test/test_herk.cc index d3f9144ba..2bbef4b7a 100644 --- 
a/test/test_herk.cc +++ b/test/test_herk.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_posv.cc b/test/test_posv.cc index 161413a70..8b2d6ef69 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_scale.cc b/test/test_scale.cc index 0d05b056d..c04652e39 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index e538476ee..18e0819fd 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_set.cc b/test/test_set.cc index 9b91e7583..85eed4200 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_svd.cc b/test/test_svd.cc index 7189293a1..36c4e9ba3 100644 --- a/test/test_svd.cc +++ b/test/test_svd.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_symm.cc b/test/test_symm.cc index e463b5d34..8d6ebed19 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_synorm.cc b/test/test_synorm.cc index 1145006fe..fa8d3b250 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index a42aaf6c9..cf051cabd 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -8,7 +8,6 @@ #include "print_matrix.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_syrk.cc b/test/test_syrk.cc index bc8e6e1d0..bae13cf10 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_trmm.cc b/test/test_trmm.cc index 941ee2ecb..a8b6d08e6 100644 --- a/test/test_trmm.cc +++ b/test/test_trmm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 4cd1a4cbc..2f650fdeb 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_trsm.cc b/test/test_trsm.cc index 5f4bd7045..d1b8affe4 100644 --- a/test/test_trsm.cc +++ b/test/test_trsm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" 
#include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_utils.hh b/test/test_utils.hh index ab1611665..6c25788d3 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -7,6 +7,7 @@ #define SLATE_TEST_UTILS_HH #include "slate/slate.hh" +#include "test.hh" ///----------------------------------------------------------------------------- /// Checks for common invalid parameter combinations From 3d3d1528e531729d3d3c1ede91ef078532ff47a3 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 14:51:43 -0500 Subject: [PATCH 25/33] Make norm testers more consistent --- test/test_henorm.cc | 256 +++++++++++++++++++--------------------- test/test_synorm.cc | 256 +++++++++++++++++++--------------------- test/test_trnorm.cc | 276 +++++++++++++++++++++----------------------- 3 files changed, 373 insertions(+), 415 deletions(-) diff --git a/test/test_henorm.cc b/test/test_henorm.cc index e03a6ac30..a9a24a9fe 100644 --- a/test/test_henorm.cc +++ b/test/test_henorm.cc @@ -97,24 +97,22 @@ void test_henorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // comparison with reference routine from ScaLAPACK - - // initialize BLACS and ScaLAPACK - blas_int ictxt, A_desc[9]; - A_alloc.create_ScaLAPACK_context( &ictxt ); - A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK + // comparison with reference routine from ScaLAPACK - auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); - if (origin != slate::Origin::ScaLAPACK && !ref_copy) { - A_data.resize( A_alloc.lld * A_alloc.nloc ); + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; - copy(A, &A_data[0], A_desc); - } + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); - if (check || ref) { - // comparison with reference routine from ScaLAPACK + copy(A, &A_data[0], A_desc); + } // allocate work space int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), @@ -122,6 +120,8 @@ void test_henorm_work(Params& params, bool run) int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklanhe( lwork ); + // comparison with reference routine from ScaLAPACK + //================================================== // Run ScaLAPACK reference routine. //================================================== @@ -158,145 +158,131 @@ void test_henorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - if (grid_order != slate::GridOrder::Col) { - printf("WARNING: cannot do extended tests with row-major grid\n"); - } - else { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; - std::vector worklanhe(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. 
- int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*A_alloc.lld ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? 
ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_planhe( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + A_norm_ref = scalapack_planhe( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (A.mpiRank() == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*A_alloc.lld ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && A.mpiRank() == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_synorm.cc b/test/test_synorm.cc index fa8d3b250..eb3dfc025 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -95,30 +95,28 @@ void test_synorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // comparison with reference routine from ScaLAPACK - - // initialize BLACS and ScaLAPACK - blas_int ictxt, A_desc[9]; - A_alloc.create_ScaLAPACK_context( &ictxt ); - A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK + // comparison with reference routine from ScaLAPACK - auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); - if (origin != slate::Origin::ScaLAPACK && !ref_copy) { - A_data.resize( A_alloc.lld * A_alloc.nloc ); + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; - copy(A, &A_data[0], A_desc); - } + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); - if (check || ref) { - // comparison with reference routine from ScaLAPACK + copy(A, &A_data[0], A_desc); + } // allocate work space int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; - std::vector worklansy(lwork); + std::vector worklansy( lwork ); //================================================== // Run ScaLAPACK reference routine. @@ -156,145 +154,131 @@ void test_synorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - if (grid_order != slate::GridOrder::Col) { - printf("WARNING: cannot do extended tests with row-major grid\n"); - } - else { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; - std::vector worklansy(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. 
- int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*A_alloc.lld ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? 
ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plansy( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklansy[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + A_norm_ref = scalapack_plansy( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklansy[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (A.mpiRank() == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real( peak ), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real( peak ), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*A_alloc.lld ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && A.mpiRank() == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 2f650fdeb..bd8d87f67 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -100,28 +100,27 @@ void test_trnorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // comparison with reference routine from ScaLAPACK - - // initialize BLACS and ScaLAPACK - blas_int ictxt, A_desc[9]; - A_alloc.create_ScaLAPACK_context( &ictxt ); - A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK + // comparison with reference routine from ScaLAPACK - auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); - if (origin != slate::Origin::ScaLAPACK && !ref_copy) { - A_data.resize( A_alloc.lld * A_alloc.nloc ); + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; - copy(A, &A_data[0], A_desc); - } + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); - // TODO move the above into this if statement - if (check || ref) { - // comparison with reference routine from ScaLAPACK + copy(A, &A_data[0], A_desc); + } // allocate work space - std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); + std::vector worklantr( std::max(A_alloc.mloc, A_alloc.nloc) ); + + // comparison with reference routine from ScaLAPACK //================================================== // Run ScaLAPACK reference routine. @@ -162,152 +161,141 @@ void test_trnorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - if (grid_order != slate::GridOrder::Col) { - printf("WARNING: cannot do extended tests with row-major grid\n"); - } - else { - // allocate work space - std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops. 
- int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : i_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*A_alloc.lld ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? 
ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plantr( - norm2str(norm), uplo2str(A.uplo()), diag2str(diag), - m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + A_norm_ref = scalapack_plantr( + norm2str(norm), uplo2str(A.uplo()), diag2str(diag), + m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (A.mpiRank() == 0) { - // if peak is nan, expect A_norm to be nan, - // except in Unit case with i == j and ii == jj, - // where peak shouldn't affect A_norm. - bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan, + // except in Unit case with i == j and ii == jj, + // where peak shouldn't affect A_norm. + bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*A_alloc.lld ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } } } } - } - - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if (A.mpiRank() == 0) - printf( "ScaLAPACK not available\n" ); - #endif + + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- From 778a8660811d0e68b11efd903e1caaa95b1284b4 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 14:55:37 -0500 Subject: [PATCH 26/33] Only do parameter check when doing actual test run --- test/test_gesv.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_gesv.cc b/test/test_gesv.cc index e8d7b96e5..1e7b00541 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -148,14 +148,14 @@ void test_gesv_work(Params& params, bool run) depth = params.depth(); } + if (! run) + return; + // Check for common invalid combinations if (is_invalid_parameters( params )) { return; } - if (! run) - return; - if ((params.routine == "gesv_mixed" || params.routine == "gesv_mixed_gmres") && ! std::is_same::value) { params.msg() = "skipping: unsupported mixed precision; must be type=d or z"; From ce5359148450934772829af9c3eb053c5e675c8c Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Tue, 23 Jan 2024 14:24:49 -0500 Subject: [PATCH 27/33] style --- test/test_utils.hh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_utils.hh b/test/test_utils.hh index 6c25788d3..df336b077 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -9,7 +9,7 @@ #include "slate/slate.hh" #include "test.hh" -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Checks for common invalid parameter combinations /// /// @return true if the configuration should be skipped @@ -59,7 +59,7 @@ inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = fal return false; } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Applies the operator thunk to each element of A and B to update B. /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the @@ -150,7 +150,7 @@ void matrix_iterator( A.releaseRemoteWorkspace(); } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// subtract_matrices takes input matrices A and B, and performs B = B - A. /// The matrices must have the same size, but can have different tile sizes and /// distributions. 
@@ -164,7 +164,7 @@ void subtract_matrices( matrix_type& A, matrix_type& B )
     matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } );
 }
 
-///-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 /// copy_matrix copies A to B
 /// The matrices must have the same size, but can have different tile sizes and
 /// distributions. However, the elements of a tile of B must all belong to the

From 3bc5090a14644a34b87dac68af0c4737fe540023 Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Tue, 23 Jan 2024 14:25:13 -0500
Subject: [PATCH 28/33] test: add trans back to getrs_tntpiv, nopiv

---
 test/run_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/run_tests.py b/test/run_tests.py
index d0ce7aace..34330b4e9 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -405,8 +405,8 @@ def filter_csv( values, csv ):
                        + ' --matrix rand_dominant' ],
     [ 'getrs',        gen + dtype + la + n + trans + ge_matrix + nonuniform_nb + thresh ],
-    [ 'getrs_tntpiv', gen + dtype + la + n + ge_matrix ],
-    [ 'getrs_nopiv',  gen + dtype + la + n + ge_matrix + nonuniform_nb
+    [ 'getrs_tntpiv', gen + dtype + la + n + trans + ge_matrix ],
+    [ 'getrs_nopiv',  gen + dtype + la + n + trans + ge_matrix + nonuniform_nb
                        + ' --matrix rand_dominant' ],
     [ 'getri',        gen + dtype + la + n ],

From 8d65640c0b89de8f421090d787bd508b20a07f52 Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 24 Jan 2024 18:12:33 -0500
Subject: [PATCH 29/33] factor nb out of {sy,he}norm loops; make {sy,he}norms
 more consistent

---
 src/internal/internal_henorm.cc |  6 +++---
 src/internal/internal_synorm.cc | 14 ++++++++------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc
index 36a0fe925..9f6cf0907 100644
--- a/src/internal/internal_henorm.cc
+++ b/src/internal/internal_henorm.cc
@@ -199,10 +199,11 @@ void norm(
         jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
             // off-diagonal blocks
             int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j)
                     && ( ( lower && i > j) ||
@@ -219,11 +220,10 @@ void norm(
                         &tiles_sums[A.m()*j + ii ], 1,
                         &values[ii], 1);
                 }
-                ii += A.tileMb(i);
+                ii += mb;
             }
 
             // diagonal blocks
-            int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 9a0b900cc..0ae70972d 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -143,7 +143,7 @@ void norm(
     int64_t jj = 0;
     #pragma omp taskgroup
     for (int64_t j = 0; j < A.nt(); ++j) {
-        // diagonal tile
+        // diagonal tiles
         if (j < A.mt() && A.tileIsLocal(j, j)) {
             #pragma omp task slate_omp_default_none \
                 shared( A, tiles_sums ) \
@@ -190,7 +190,7 @@ void norm(
             }
             jj += A.tileNb(j);
         }
-        //end omp taskgroup
+        // end omp taskgroup
 
         // Sum tile results into local results.
         // Summing up local contributions only.
@@ -198,10 +198,11 @@ void norm(
         jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
             // off-diagonal blocks
             int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j)
                     && ( ( lower && i > j) ||
@@ -218,11 +219,10 @@ void norm(
                         &tiles_sums[A.m()*j + ii ], 1,
                         &values[ii], 1);
                 }
-                ii += A.tileMb(i);
+                ii += mb;
             }
 
             // diagonal blocks
-            int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
@@ -302,6 +302,7 @@ void norm(
             }
         }
     }
+    // end omp taskgroup
 }
 
@@ -326,7 +327,8 @@ void norm(
 /// @ingroup norm_internal
 ///
 template <typename scalar_t>
-void norm(internal::TargetType,
+void norm(
+    internal::TargetType,
     Norm in_norm, NormScope scope, SymmetricMatrix<scalar_t>& A,
     blas::real_type<scalar_t>* values,
     int priority, int queue_index)

From 60041846970eafdea490e342b46f5db9c8d53d6b Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 15:58:11 -0500
Subject: [PATCH 30/33] remove unused vals_dev_arrays

---
 src/internal/internal_henorm.cc | 4 +---
 src/internal/internal_synorm.cc | 9 +--------
 src/internal/internal_trnorm.cc | 4 +---
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc
index 9f6cf0907..992287b73 100644
--- a/src/internal/internal_henorm.cc
+++ b/src/internal/internal_henorm.cc
@@ -346,8 +346,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
 
@@ -375,7 +373,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, layout, lower, queue_index, in_norm, ldv) \
             priority(priority)
         {
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 0ae70972d..79a5ba291 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -350,8 +350,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
 
@@ -379,7 +377,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, lower, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {
@@ -496,11 +494,6 @@ void norm(
         }
         // end omp taskgroup
 
-        for (int device = 0; device < A.num_devices(); ++device) {
-            blas::Queue* queue = A.compute_queue(device, queue_index);
-            blas::device_free(vals_dev_arrays[device], *queue);
-        }
-
         // Reduction over devices to local result.
         if (in_norm == Norm::Max) {
             *values = lapack::lange(in_norm,
diff --git a/src/internal/internal_trnorm.cc b/src/internal/internal_trnorm.cc
index 72e203c44..83cf429ae 100644
--- a/src/internal/internal_trnorm.cc
+++ b/src/internal/internal_trnorm.cc
@@ -365,8 +365,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
 
@@ -399,7 +397,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, irange, jrange ) \
+            shared( vals_host_arrays, irange, jrange ) \
             firstprivate(device, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {

From 971dc35f325fdfd45ef1b236692694b80aa0d582 Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 16:00:41 -0500
Subject: [PATCH 31/33] use B_alloc.lld; wrap

---
 test/test_posv.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/test_posv.cc b/test/test_posv.cc
index 8b2d6ef69..398ac323b 100644
--- a/test/test_posv.cc
+++ b/test/test_posv.cc
@@ -385,9 +385,13 @@ void test_posv_work(Params& params, bool run)
         if (verbose > 2) {
             if (origin == slate::Origin::ScaLAPACK) {
-                slate::Debug::diffLapackMatrices(n, n, &A_data[0], A_alloc.lld, &Aref_data[0], A_alloc.lld, nb, nb);
+                slate::Debug::diffLapackMatrices(
+                    n, n, &A_data[0], A_alloc.lld,
+                    &Aref_data[0], A_alloc.lld, nb, nb);
                 if (params.routine != "potrf") {
-                    slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], A_alloc.lld, &Bref_data[0], A_alloc.lld, nb, nb);
+                    slate::Debug::diffLapackMatrices(
+                        n, nrhs, &B_data[0], B_alloc.lld,
+                        &Bref_data[0], B_alloc.lld, nb, nb);
                 }
             }
         }

From 564f45710a49ecd962f9e9b97a95aa4e5066470c Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 16:05:26 -0500
Subject: [PATCH 32/33] use params.matrixC

---
 test/test_herk.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_herk.cc b/test/test_herk.cc
index 2bbef4b7a..2c86f7500 100644
--- a/test/test_herk.cc
+++ b/test/test_herk.cc
@@ -45,7 +45,7 @@ void test_herk_work(Params& params, bool run)
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
     params.matrix.mark();
-    params.matrixB.mark();
+    params.matrixC.mark();
 
     mark_params_for_test_HermitianMatrix( params );
     mark_params_for_test_Matrix( params );
@@ -91,7 +91,7 @@ void test_herk_work(Params& params, bool run)
     auto& Cref = C_alloc.Aref;
 
     slate::generate_matrix( params.matrix, A );
-    slate::generate_matrix( params.matrixB, C );
+    slate::generate_matrix( params.matrixC, C );
 
     // If reference run is required, record norms to be used in the check/ref.
     real_t A_norm=0, C_orig_norm=0;
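Patches 31 and 32 are small correctness fixes to the test drivers. The lld change in patch 31 matters because the tests store each local ScaLAPACK block column-major, so element (ilocal, jlocal) lives at data[ ilocal + jlocal*lld ], as the extended norm check above also relies on; indexing B's buffer with A's leading dimension only works when the two allocations happen to agree. A minimal sketch of the indexing convention, using a hypothetical local_entry helper that is not part of these patches:

    #include <cstdint>
    #include <vector>

    // Column-major local storage: an mloc-by-nloc local block with leading
    // dimension lld >= mloc keeps element (ilocal, jlocal) at
    // data[ ilocal + jlocal*lld ], matching the A_data/B_data buffers above.
    template <typename scalar_t>
    scalar_t& local_entry( std::vector<scalar_t>& data,
                           int64_t ilocal, int64_t jlocal, int64_t lld )
    {
        return data[ ilocal + jlocal*lld ];
    }
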
From d1eca32c92f80df7b46546b6a2e47f8d98a3484f Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 16:37:00 -0500
Subject: [PATCH 33/33] test: replace DevDist with GridOrder

---
 test/matrix_utils.cc |  9 ++++-----
 test/matrix_utils.hh |  2 +-
 test/run_tests.py    | 16 ++++++++--------
 test/test.cc         |  5 +++--
 test/test.hh         | 33 +--------------------------------
 test/test_utils.hh   | 10 +++++-----
 6 files changed, 22 insertions(+), 53 deletions(-)

diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc
index 0de0e57e9..c751bb302 100644
--- a/test/matrix_utils.cc
+++ b/test/matrix_utils.cc
@@ -21,11 +21,11 @@ static TestMatrix allocate_test_shared(
     // Load params variables
     int p = params.grid.m();
     int q = params.grid.n();
-    slate::Dist dev_dist = params.dev_dist();
     int64_t nb = params.nb();
     bool nonuniform_nb = params.nonuniform_nb() == 'y';
     slate::Origin origin = params.origin();
     slate::GridOrder grid_order = params.grid_order();
+    slate::GridOrder dev_order = params.dev_order();
 
     // The object to be returned
     TestMatrix matrix( m, n, nb, p, q, grid_order );
@@ -48,14 +48,13 @@ static TestMatrix allocate_test_shared(
     auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
     int num_devices_ = blas::get_device_count();
-    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
-                                                   p, num_devices_ );
+    auto tileDevice = slate::func::device_1d_grid( dev_order, p, num_devices_ );
 
     // Setup matrix to test SLATE with
     if (origin != slate::Origin::ScaLAPACK) {
         // SLATE allocates CPU or GPU tiles.
         slate::Target origin_target = origin2target( origin );
-        if (nonuniform_nb || dev_dist == slate::Dist::Col) {
+        if (nonuniform_nb || dev_order == slate::GridOrder::Col) {
             matrix.A = construct_irregular( tileNb, tileRank, tileDevice );
         }
         else {
@@ -65,7 +64,7 @@ static TestMatrix allocate_test_shared(
     }
     else {
         assert( !nonuniform_nb );
-        assert( dev_dist == slate::Dist::Row );
+        assert( dev_order == slate::GridOrder::Row );
         // Create SLATE matrix from the ScaLAPACK layouts
         matrix.A_data.resize( matrix.lld * matrix.nloc );
         matrix.A = construct_scalapack( &matrix.A_data[0], matrix.lld, nb,
diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 247a8ef12..b742b7ccf 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -265,11 +265,11 @@ inline void mark_params_for_test_Matrix(Params& params)
 {
     params.grid.m();
     params.grid.n();
-    params.dev_dist();
     params.nb();
     params.nonuniform_nb();
     params.origin();
     params.grid_order();
+    params.dev_order();
 }
 
 //------------------------------------------------------------------------------
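The device_1d_grid call above no longer needs the slate::GridOrder( dev_dist ) cast. That cast was only well defined because Dist carried the same underlying char values; the Dist enum deleted from test.hh later in this patch shows Row = 'R' and Col = 'C'. A sketch of the equivalence the old shim relied on, where the GridOrder values are an assumption for illustration, not quoted from slate:

    // Hypothetical stand-ins to illustrate the value-preserving cast;
    // slate's real GridOrder may define its values differently.
    enum class Dist : char { Row = 'R', Col = 'C' };
    enum class GridOrder : char { Col = 'C', Row = 'R' };

    static_assert( char(Dist::Row) == char(GridOrder::Row)
                   && char(Dist::Col) == char(GridOrder::Col),
                   "GridOrder(dev_dist) is value-preserving only if these match" );
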
diff --git a/test/run_tests.py b/test/run_tests.py
index d0ce7aace..574ecfc10 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -121,13 +121,13 @@
 group_opt.add_argument( '--origin', action='store', help='default=%(default)s', default='s' )
 group_opt.add_argument( '--target', action='store', help='default=%(default)s', default='t' )
 group_opt.add_argument( '--lookahead', action='store', help='default=%(default)s', default='1' )
-group_opt.add_argument( '--dev-dist', action='store', help='default=%(default)s', default='c,r' )
 group_opt.add_argument( '--nb', action='store', help='default=%(default)s', default='64,100' )
 group_opt.add_argument( '--nonuniform-nb', action='store', help='default=%(default)s', default='n' )
 group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' )
 group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' )
 group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' )
-group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='r' )
+group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='' )
+group_opt.add_argument( '--dev-order', action='store', help='default=%(default)s', default='r,c' )
 group_opt.add_argument( '--repeat', action='store', help='times to repeat each test', default='' )
 group_opt.add_argument( '--thresh', action='store', help='default=%(default)s', default='1,0.5' )
 group_opt.add_argument( '--matrix', action='store', help='default=%(default)s', default='' )
@@ -298,12 +298,12 @@
 origin        = ' --origin '        + opts.origin        if (opts.origin)        else ''
 target        = ' --target '        + opts.target        if (opts.target)        else ''
 la            = ' --lookahead '     + opts.lookahead     if (opts.lookahead)     else ''
-ddist         = ' --dev-dist '      + opts.dev_dist      if (opts.dev_dist)      else ''
 nb            = ' --nb '            + opts.nb            if (opts.nb)            else ''
 nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else ''
 nt            = ' --nt '            + opts.nt            if (opts.nt)            else ''
 grid          = ' --grid '          + opts.grid          if (opts.grid)          else ''
-grid_order    = ' --grid-order '    + opts.grid_order    if (opts.grid_order)    else ''
+grid_order    = ' --grid-order '    + opts.grid_order    if (opts.grid_order)    else ''
+dev_order     = ' --dev-order '     + opts.dev_order     if (opts.dev_order)     else ''
 repeat        = ' --repeat '        + opts.repeat        if (opts.repeat)        else ''
 thresh        = ' --thresh '        + opts.thresh        if (opts.thresh)        else ''
 matrix        = ' --matrix '        + opts.matrix        if (opts.matrix)        else ''
@@ -316,10 +316,10 @@
 gen_no_nb = origin + target + grid + check + ref + tol + repeat
 gen_no_target = grid + check + ref + tol + repeat + nb
 
-ge_matrix = ddist + grid_order
-sy_matrix = uplo + ddist + grid_order
-he_matrix = uplo + ddist + grid_order
-tr_matrix = uplo + diag + ddist + grid_order
+ge_matrix = grid_order + dev_order
+sy_matrix = grid_order + dev_order + uplo
+he_matrix = grid_order + dev_order + uplo
+tr_matrix = grid_order + dev_order + uplo + diag
 
 if (opts.matrix):
     gen += matrix
diff --git a/test/test.cc b/test/test.cc
index 16ba41ecf..3ba473dc2 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -356,8 +356,8 @@ Params::Params():
     method_lu   ("lu",   5, ParamType::List, slate::MethodLU::PartialPiv, str2methodLU, methodLU2str, "PartialPiv, CALU, NoPiv"),
     method_trsm ("trsm", 4, ParamType::List, 0, str2methodTrsm, methodTrsm2str, "auto=auto, A=trsmA, B=trsmB"),
 
-    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
-    dev_dist  ("dev-dist", 9, ParamType::List, slate::Dist::Row, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"),
+    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
+    dev_order ("do", 3, ParamType::List, slate::GridOrder::Row, str2grid_order, grid_order2str, "(do) Device grid order: c=Col, r=Row"),
 
     // name, w, type, default, char2enum, enum2char, enum2str, help
     layout ("layout", 6, ParamType::List, slate::Layout::ColMajor, blas::char2layout, blas::layout2char, blas::layout2str, "layout: r=row major, c=column major"),
@@ -463,6 +463,7 @@ Params::Params():
     lookahead.name("la", "lookahead");
     panel_threads.name("pt", "panel-threads");
     grid_order.name("go", "grid-order");
+    dev_order.name("do", "dev-order");
 
     // Change name for the methods to use less space in the stdout
     method_cholQR.name("cholQR", "method-cholQR");
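Registering dev_order with str2grid_order and grid_order2str means one converter pair now serves both grid options, which is what allows the Dist-specific str2dist and dist2str helpers to be deleted from test.hh below. A condensed sketch of such a converter pair, modeled on the deleted Dist versions rather than on the actual test.cc implementation, with assumed enum values:

    #include <cctype>
    #include <stdexcept>

    enum class GridOrder : char { Col = 'C', Row = 'R' };  // assumed values

    // Parse "c"/"col"/"r"/"row" style spellings by their first letter.
    inline GridOrder str2grid_order_sketch( const char* str )
    {
        char c = (char) std::tolower( (unsigned char) str[0] );
        if (c == 'c')
            return GridOrder::Col;
        else if (c == 'r')
            return GridOrder::Row;
        else
            throw std::runtime_error( "unknown grid order" );
    }

    inline const char* grid_order2str_sketch( GridOrder order )
    {
        switch (order) {
            case GridOrder::Col: return "col";
            case GridOrder::Row: return "row";
        }
        return "?";
    }
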
diff --git a/test/test.hh b/test/test.hh
index 57c428443..8dec12510 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -27,11 +27,6 @@ enum class Origin : char {
     Devices = 'D',
 };
 
-enum class Dist : char {
-    Row = 'R',
-    Col = 'C',
-};
-
 } // namespace slate
 
 // -----------------------------------------------------------------------------
@@ -88,7 +83,7 @@ public:
     testsweeper::ParamEnum< slate::Method > method_trsm;
 
     testsweeper::ParamEnum< slate::GridOrder > grid_order;
-    testsweeper::ParamEnum< slate::Dist > dev_dist;
+    testsweeper::ParamEnum< slate::GridOrder > dev_order;
 
     // ----- test matrix parameters
     MatrixParams matrix;
@@ -284,32 +279,6 @@ void test_scale (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set (Params& params, bool run);
 
-// -----------------------------------------------------------------------------
-inline slate::Dist str2dist(const char* dist)
-{
-    std::string distribution_ = dist;
-    std::transform(
-        distribution_.begin(),
-        distribution_.end(),
-        distribution_.begin(), ::tolower);
-    if (distribution_ == "row" || distribution_ == "r")
-        return slate::Dist::Row;
-    else if (distribution_ == "col" || distribution_ == "c"
-             || distribution_ == "column")
-        return slate::Dist::Col;
-    else
-        throw slate::Exception("unknown distribution");
-}
-
-inline const char* dist2str(slate::Dist dist)
-{
-    switch (dist) {
-        case slate::Dist::Row: return "row";
-        case slate::Dist::Col: return "col";
-    }
-    return "?";
-}
-
 // -----------------------------------------------------------------------------
 inline slate::Origin str2origin(const char* origin)
 {
diff --git a/test/test_utils.hh b/test/test_utils.hh
index df336b077..bdae1d494 100644
--- a/test/test_utils.hh
+++ b/test/test_utils.hh
@@ -18,16 +18,16 @@ inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = fal
 {
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
-    slate::Dist dev_dist = params.dev_dist();
+    slate::GridOrder dev_order = params.dev_order();
     bool nonuniform_nb = params.nonuniform_nb() == 'y';
 
-    if (target != slate::Target::Devices && dev_dist == slate::Dist::Col) {
-        params.msg() = "skipping: dev_dist = Col applies only to target devices";
+    if (target != slate::Target::Devices && dev_order == slate::GridOrder::Col) {
+        params.msg() = "skipping: dev_order = Col applies only to target devices";
         return true;
     }
 
-    if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) {
-        params.msg() = "skipping: dev_dist = Col tile not supported with origin=ScaLAPACK";
+    if (dev_order == slate::GridOrder::Col && origin == slate::Origin::ScaLAPACK) {
+        params.msg() = "skipping: dev_order = Col not supported with origin=ScaLAPACK";
         return true;
     }
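Together with patch 26, this leaves test_gesv (and the other drivers that share these helpers) with a uniform guard sequence: return from the dry-run pass first, then consult is_invalid_parameters so that skip messages are only produced for configurations that would actually execute. A condensed sketch of the resulting pattern, using the names from the test_gesv.cc diff above:

    struct Params;  // the test harness parameter set declared in test.hh
    bool is_invalid_parameters( Params& params );  // from test_utils.hh

    void test_work( Params& params, bool run )
    {
        // ... read and mark params so they appear in the output header ...

        // First pass only registers parameters; nothing to validate yet.
        if (! run)
            return;

        // Check for common invalid combinations (may set a skip message).
        if (is_invalid_parameters( params )) {
            return;
        }

        // ... the actual test body runs only for valid configurations ...
    }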