diff --git a/GNUmakefile b/GNUmakefile
index 604394716..b71c10c0c 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -702,6 +702,7 @@ endif
 tester_src += \
 	test/matrix_generator.cc \
 	test/matrix_params.cc \
+	test/matrix_utils.cc \
 	test/random.cc \
 	test/test.cc \
 	test/test_add.cc \
diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc
index 7b4eecd14..992287b73 100644
--- a/src/internal/internal_henorm.cc
+++ b/src/internal/internal_henorm.cc
@@ -196,12 +196,14 @@ void norm(
         // Sum tile results into local results.
         // Summing up local contributions only.
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j) &&
                     ( ( lower && i > j) ||
@@ -210,27 +212,26 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += mb;
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
-            int64_t nb = A.tileNb(j);
+            // diagonal blocks
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
 //---------
@@ -345,8 +346,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
@@ -374,7 +373,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, layout, lower, queue_index, in_norm, ldv) \
             priority(priority)
         {
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 9f814b898..79a5ba291 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -143,7 +143,7 @@ void norm(
     int64_t jj = 0;
     #pragma omp taskgroup
     for (int64_t j = 0; j < A.nt(); ++j) {
-        // diagonal tile
+        // diagonal tiles
         if (j < A.mt() && A.tileIsLocal(j, j)) {
             #pragma omp task slate_omp_default_none \
                 shared( A, tiles_sums ) \
@@ -190,17 +190,19 @@ void norm(
             }
             jj += A.tileNb(j);
         }
-        //end omp taskgroup
+        // end omp taskgroup
 
         // Sum tile results into local results.
         // Summing up local contributions only.
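+        // Running offsets (jj over columns, ii over rows) below replace the
+        // old fixed nb0/mb0 strides, so values[] and tiles_sums[] are indexed
+        // correctly when tile sizes are non-uniform.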
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j) &&
                     ( ( lower && i > j) ||
@@ -209,27 +211,26 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += mb;
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
-            int64_t nb = A.tileNb(j);
+            // diagonal blocks
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
 //---------
@@ -301,6 +302,7 @@ void norm(
                 }
             }
         }
+        // end omp taskgroup
     }
 }
@@ -325,7 +327,8 @@
 /// @ingroup norm_internal
 ///
 template <Target target, typename scalar_t>
-void norm(internal::TargetType<target>,
+void norm(
+    internal::TargetType<target>,
           Norm in_norm, NormScope scope, SymmetricMatrix<scalar_t>& A,
           blas::real_type<scalar_t>* values,
           int priority, int queue_index)
@@ -347,8 +350,6 @@ void norm(internal::TargetType<target>,
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
@@ -376,7 +377,7 @@ void norm(internal::TargetType<target>,
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, lower, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {
@@ -493,11 +494,6 @@ void norm(internal::TargetType<target>,
     }
     // end omp taskgroup
 
-    for (int device = 0; device < A.num_devices(); ++device) {
-        blas::Queue* queue = A.compute_queue(device, queue_index);
-        blas::device_free(vals_dev_arrays[device], *queue);
-    }
-
     // Reduction over devices to local result.
     if (in_norm == Norm::Max) {
         *values = lapack::lange(in_norm,
diff --git a/src/internal/internal_trnorm.cc b/src/internal/internal_trnorm.cc
index 72e203c44..83cf429ae 100644
--- a/src/internal/internal_trnorm.cc
+++ b/src/internal/internal_trnorm.cc
@@ -365,8 +363,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
@@ -399,7 +397,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, irange, jrange ) \
+            shared( vals_host_arrays, irange, jrange ) \
             firstprivate(device, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {
diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc
new file mode 100644
index 000000000..c751bb302
--- /dev/null
+++ b/test/matrix_utils.cc
@@ -0,0 +1,407 @@
+// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
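+
+// Factory routines that allocate the SLATE test matrices (and, optionally,
+// reference copies) described by the test Params: tile size nb, uniform vs.
+// non-uniform nb, process grid order, and device grid order.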
+
+#include "matrix_utils.hh"
+
+using nb_func_t = std::function< int64_t(int64_t) >;
+using dist_func_t = std::function< int(std::tuple<int64_t, int64_t>) >;
+
+//------------------------------------------------------------------------------
+/// Shared logic of the allocate_test_* routines
+template <typename matrix_type,
+          typename irregular_constructor_t,
+          typename regular_constructor_t,
+          typename scalapack_constructor_t>
+static TestMatrix<matrix_type> allocate_test_shared(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params,
+    irregular_constructor_t construct_irregular,
+    regular_constructor_t construct_regular,
+    scalapack_constructor_t construct_scalapack)
+{
+    // Load params variables
+    int p = params.grid.m();
+    int q = params.grid.n();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+    slate::GridOrder dev_order = params.dev_order();
+
+    // The object to be returned
+    TestMatrix<matrix_type> matrix( m, n, nb, p, q, grid_order );
+
+    // Functions for nonuniform tile sizes or row device distribution
+    nb_func_t tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        // NB. we let BaseMatrix truncate the final tile length
+        // TrapezoidMatrix only takes 1 function for both dimensions (of different sizes)
+        tileNb = [nb](int64_t j) {
+            return nb;
+        };
+    }
+
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( dev_order, p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        // SLATE allocates CPU or GPU tiles.
+        slate::Target origin_target = origin2target( origin );
+        if (nonuniform_nb || dev_order == slate::GridOrder::Col) {
+            matrix.A = construct_irregular( tileNb, tileRank, tileDevice );
+        }
+        else {
+            matrix.A = construct_regular( nb, grid_order, p, q );
+        }
+        matrix.A.insertLocalTiles( origin_target );
+    }
+    else {
+        assert( !nonuniform_nb );
+        assert( dev_order == slate::GridOrder::Row );
+        // Create SLATE matrix from the ScaLAPACK layouts
+        matrix.A_data.resize( matrix.lld * matrix.nloc );
+        matrix.A = construct_scalapack( &matrix.A_data[0], matrix.lld, nb,
+                                        grid_order, p, q );
+    }
+
+    // Setup reference matrix
+    if (ref_matrix) {
+        if (nonuniform_nb && nonuniform_ref) {
+            matrix.Aref = construct_irregular( tileNb, tileRank, tileDevice );
+            matrix.Aref.insertLocalTiles( slate::Target::Host );
+        }
+        else {
+            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
+            matrix.Aref = construct_scalapack( &matrix.Aref_data[0], matrix.lld, nb,
+                                               grid_order, p, q );
+        }
+    }
+
+    return matrix;
+}
+
+// -----------------------------------------------------------------------------
+/// Allocates a Matrix and optionally a reference version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] m
+///     The number of rows
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params)
+{
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return slate::Matrix<scalar_t>( m, n, tileNb, tileNb,
+                                        tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::Matrix<scalar_t>( m, n, nb, nb, grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::Matrix<scalar_t>::fromScaLAPACK(
+                    m, n, data, lld, nb, nb,
+                    grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<slate::Matrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, m, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+// Explicit instantiations.
+template
+TestMatrix<slate::Matrix<float>> allocate_test_Matrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::Matrix<double>> allocate_test_Matrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::Matrix<std::complex<float>>> allocate_test_Matrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::Matrix<std::complex<double>>> allocate_test_Matrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+//------------------------------------------------------------------------------
+/// Helper routine to avoid duplicating logic between HermitianMatrix
+/// and SymmetricMatrix
+///
+template <typename matrix_type>
+TestMatrix<matrix_type> allocate_test_HeSyMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    using scalar_t = typename matrix_type::value_type;
+
+    slate::Uplo uplo = params.uplo();
+
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return matrix_type( uplo, n, tileNb,
+                            tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return matrix_type( uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return matrix_type::fromScaLAPACK( uplo, n, data, lld, nb,
+                                           grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<matrix_type>(
+                ref_matrix, nonuniform_ref, n, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+
+//------------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::HermitianMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::HermitianMatrix<float>> allocate_test_HermitianMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<double>> allocate_test_HermitianMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<float>>> allocate_test_HermitianMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<double>>> allocate_test_HermitianMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a SymmetricMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::SymmetricMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::SymmetricMatrix<float>> allocate_test_SymmetricMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<double>> allocate_test_SymmetricMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<float>>> allocate_test_SymmetricMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<double>>> allocate_test_SymmetricMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+
+//------------------------------------------------------------------------------
+/// Allocates a TriangularMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return slate::TriangularMatrix<scalar_t>( uplo, diag, n, tileNb,
+                                                  tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TriangularMatrix<scalar_t>( uplo, diag, n, nb,
+                                                  grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TriangularMatrix<scalar_t>::fromScaLAPACK(
+                    uplo, diag, n, data, lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<slate::TriangularMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, n, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::TriangularMatrix<float>> allocate_test_TriangularMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TriangularMatrix<double>> allocate_test_TriangularMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TriangularMatrix<std::complex<float>>> allocate_test_TriangularMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TriangularMatrix<std::complex<double>>> allocate_test_TriangularMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a TrapezoidMatrix and optionally a reference version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] m
+///     The number of rows
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TrapezoidMatrix<scalar_t>> allocate_test_TrapezoidMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return slate::TrapezoidMatrix<scalar_t>( uplo, diag, m, n, tileNb,
+                                                 tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TrapezoidMatrix<scalar_t>( uplo, diag, m, n, nb,
+                                                 grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TrapezoidMatrix<scalar_t>::fromScaLAPACK(
+                    uplo, diag, m, n, data, lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<slate::TrapezoidMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, m, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::TrapezoidMatrix<float>> allocate_test_TrapezoidMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<double>> allocate_test_TrapezoidMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<float>>> allocate_test_TrapezoidMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<double>>> allocate_test_TrapezoidMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 7d7f44a96..b742b7ccf 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -7,8 +7,10 @@
 #define SLATE_MATRIX_UTILS_HH
 
 #include "slate/slate.hh"
+#include "test.hh"
 
 #include "scalapack_wrappers.hh"
+#include "grid_utils.hh"
 
 //------------------------------------------------------------------------------
 // Zero out B, then copy band matrix B from A.
@@ -214,6 +216,20 @@ class TestMatrix {
     using scalar_t = typename MatrixType::value_type;
 
 public:
+    TestMatrix() {}
+
+    TestMatrix(int64_t m_, int64_t n_, int64_t nb_,
+               int p_, int q_, slate::GridOrder grid_order_)
+        : m(m_), n(n_), nb(nb_), p(p_), q(q_), grid_order(grid_order_)
+    {
+        int mpi_rank, myrow, mycol;
+        MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
+        gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
+        this->mloc = num_local_rows_cols( m, nb, myrow, p );
+        this->nloc = num_local_rows_cols( n, nb, mycol, q );
+        this->lld  = blas::max( 1, mloc ); // local leading dimension of A
+    }
+
     // SLATE matrices
     MatrixType A;
     MatrixType Aref;
@@ -224,13 +240,22 @@ class TestMatrix {
 
     // ScaLAPACK configuration
     int64_t m, n, mloc, nloc, lld, nb;
+    int p, q;
+    slate::GridOrder grid_order;
 
 #ifdef SLATE_HAVE_SCALAPACK
-    void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) {
+    void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] )
+    {
        int64_t info;
        scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mloc, &info);
        slate_assert(info == 0);
     }
+
+    void create_ScaLAPACK_context( blas_int* ictxt )
+    {
+        // Call free function version
+        ::create_ScaLAPACK_context( grid_order, p, q, ictxt );
+    }
 #endif
 };
@@ -244,109 +269,60 @@ inline void mark_params_for_test_Matrix(Params& params)
     params.nonuniform_nb();
     params.origin();
     params.grid_order();
+    params.dev_order();
 }
 
 //------------------------------------------------------------------------------
-/// Allocates a Matrix and optionally a reference version for testing.
-///
-/// @param[in] ref_matrix
-///     Whether to allocate a reference matrix
-///
-/// @param[in] nonuniform_ref
-///     If params.nonuniform_nb(), whether to also allocate the reference matrix
-///     with non-uniform tiles.
-///
-/// @param[in] m
-///     The number of rows
-///
-/// @param[in] n
-///     The number of columns
-///
-/// @param[in] params
-///     The test params object which contains many of the key parameters
-///
-template <typename scalar_t>
-TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
-    bool ref_matrix,
-    bool nonuniform_ref,
-    int64_t m,
-    int64_t n,
-    Params& params)
+/// Marks the parameters used by allocate_test_HermitianMatrix
+inline void mark_params_for_test_HermitianMatrix(Params& params)
 {
-    // Load params variables
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
-    int64_t nb = params.nb();
-    bool nonuniform_nb = params.nonuniform_nb() == 'y';
-    slate::Origin origin = params.origin();
-    slate::GridOrder grid_order = params.grid_order();
-
-    // The object to be returned
-    TestMatrix<slate::Matrix<scalar_t>> matrix;
-    matrix.m = m;
-    matrix.n = n;
-
-    // ScaLAPACK variables for reference matrix
-    int mpi_rank, myrow, mycol;
-    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
-    gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
-    matrix.nb = nb;
-    matrix.mloc = num_local_rows_cols( m, nb, myrow, p );
-    matrix.nloc = num_local_rows_cols( n, nb, mycol, q );
-    matrix.lld  = blas::max( 1, matrix.mloc ); // local leading dimension of A
-
-    // Functions for nonuniform tile sizes.
-    // Odd-numbered tiles are 2*nb, even-numbered tiles are nb.
-    std::function< int64_t (int64_t j) >
-    tileNb = [nb](int64_t j) {
-        return (j % 2 != 0 ? nb*2 : nb);
-    };
-    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
-    int num_devices_ = blas::get_device_count();
-    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col,
-                                                   p, num_devices_ );
-
-    // Setup matrix to test SLATE with
-    if (origin != slate::Origin::ScaLAPACK) {
-        // SLATE allocates CPU or GPU tiles.
-        slate::Target origin_target = origin2target( origin );
-        if (nonuniform_nb) {
-            params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) )
-                           + ", " + std::to_string( tileNb( 1 ) );
-            matrix.A = slate::Matrix<scalar_t>( m, n, tileNb, tileNb, tileRank,
-                                                tileDevice, MPI_COMM_WORLD);
-        }
-        else {
-            matrix.A = slate::Matrix<scalar_t>( m, n, nb, nb,
-                                                grid_order, p, q, MPI_COMM_WORLD );
-        }
-        matrix.A.insertLocalTiles( origin_target );
-    }
-    else {
-        assert( !nonuniform_nb );
-        // Create SLATE matrix from the ScaLAPACK layouts
-        matrix.A_data.resize( matrix.lld * matrix.nloc );
-        matrix.A = slate::Matrix<scalar_t>::fromScaLAPACK(
-                       m, n, &matrix.A_data[0], matrix.lld, nb, nb,
-                       grid_order, p, q, MPI_COMM_WORLD );
-    }
+    params.uplo();
+    mark_params_for_test_Matrix( params );
+}
 
-    // Setup reference matrix
-    if (ref_matrix) {
-        if (nonuniform_nb && nonuniform_ref) {
-            matrix.Aref = slate::Matrix<scalar_t>( m, n, tileNb, tileNb, tileRank,
-                                                   tileDevice, MPI_COMM_WORLD );
-            matrix.Aref.insertLocalTiles( slate::Target::Host );
-        }
-        else {
-            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
-            matrix.Aref = slate::Matrix<scalar_t>::fromScaLAPACK(
-                              m, n, &matrix.Aref_data[0], matrix.lld, nb, nb,
-                              grid_order, p, q, MPI_COMM_WORLD );
-        }
-    }
+//------------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_SymmetricMatrix
+inline void mark_params_for_test_SymmetricMatrix(Params& params)
+{
+    mark_params_for_test_HermitianMatrix( params );
+}
+
+//------------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TriangularMatrix
+inline void mark_params_for_test_TriangularMatrix(Params& params)
+{
+    params.uplo();
+    params.diag();
+    mark_params_for_test_Matrix( params );
+}
 
-    return matrix;
+//------------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TrapezoidMatrix
+inline void mark_params_for_test_TrapezoidMatrix(Params& params)
+{
+    params.uplo();
+    params.diag();
+    mark_params_for_test_Matrix( params );
 }
 
+template <typename scalar_t>
+TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::TrapezoidMatrix<scalar_t>> allocate_test_TrapezoidMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
 #endif // SLATE_MATRIX_UTILS_HH
diff --git a/test/run_tests.py b/test/run_tests.py
index 4561e217f..574ecfc10 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -121,12 +121,13 @@
 group_opt.add_argument( '--origin', action='store', help='default=%(default)s', default='s' )
 group_opt.add_argument( '--target', action='store', help='default=%(default)s', default='t' )
 group_opt.add_argument( '--lookahead', action='store', help='default=%(default)s', default='1' )
-group_opt.add_argument( '--dev-dist', action='store', help='default=%(default)s', default='c,r' )
 group_opt.add_argument( '--nb', action='store', help='default=%(default)s', default='64,100' )
 group_opt.add_argument( '--nonuniform-nb', action='store', help='default=%(default)s', default='n' )
 group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' )
 group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' )
 group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' )
+group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='' )
+group_opt.add_argument( '--dev-order', action='store', help='default=%(default)s', default='r,c' )
 group_opt.add_argument( '--repeat', action='store', help='times to repeat each test', default='' )
 group_opt.add_argument( '--thresh', action='store', help='default=%(default)s', default='1,0.5' )
 group_opt.add_argument( '--matrix', action='store', help='default=%(default)s', default='' )
@@ -297,11 +298,12 @@
 origin = ' --origin ' + opts.origin if (opts.origin) else ''
 target = ' --target ' + opts.target if (opts.target) else ''
 la = ' --lookahead ' + opts.lookahead if (opts.lookahead) else ''
-ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else ''
 nb = ' --nb ' + opts.nb if (opts.nb) else ''
 nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else ''
 nt = ' --nt ' + opts.nt if (opts.nt) else ''
 grid = ' --grid ' + opts.grid if (opts.grid) else ''
+grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else ''
+dev_order = ' --dev-order ' + opts.dev_order if (opts.dev_order) else ''
 repeat = ' --repeat ' + opts.repeat if (opts.repeat) else ''
 thresh = ' --thresh ' + opts.thresh if (opts.thresh) else ''
 matrix = ' --matrix ' + opts.matrix if (opts.matrix) else ''
@@ -314,6 +316,11 @@
 gen_no_nb = origin + target + grid + check + ref + tol + repeat
 gen_no_target = grid + check + ref + tol + repeat + nb
 
+ge_matrix = grid_order + dev_order
+sy_matrix = grid_order + dev_order + uplo
+he_matrix = grid_order + dev_order + uplo
+tr_matrix = grid_order + dev_order + uplo + diag
+
 if (opts.matrix):
     gen += matrix
 
@@ -348,67 +355,67 @@ def filter_csv( values, csv ):
 cmds += [
     [ 'gbmm',  gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ],
 
-    [ 'gemm',  gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ],
-    [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ],
-    [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ],
+    [ 'gemm',  gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ],
+    [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ],
+    [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ],
 
-    [ 'hemm',  gen + dtype + la + side + uplo + mn + ab + matrixBC ],
+    [ 'hemm',  gen + dtype + la + side + he_matrix + mn + ab + matrixBC ],
     # todo: hemmA GPU support
-    [ 'hemmA', gen_no_target + dtype + la + side + uplo + mn + ab + matrixBC ],
-    [ 'hemmC', gen + dtype + la + side + uplo + mn + ab + matrixBC],
+    [ 'hemmA', gen_no_target + dtype + la + side + he_matrix + mn + ab + matrixBC ],
+    [ 'hemmC', gen + dtype + la + side + he_matrix + mn + ab + matrixBC],
 
     [ 'hbmm',  gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ],
 
-    [ 'herk',  gen + dtype_real    + la + uplo + trans    + mn + ab + matrixC ],
-    [ 'herk',  gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixC ],
+    [ 'herk',  gen + dtype_real    + la + he_matrix + trans    + mn + ab + matrixC ],
+    [ 'herk',  gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixC ],
 
-    [ 'her2k', gen + dtype_real    + la + uplo + trans    + mn + ab + matrixBC ],
-    [ 'her2k', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixBC ],
+    [ 'her2k', gen + dtype_real    + la + he_matrix + trans    + mn + ab + matrixBC ],
+    [ 'her2k', gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixBC ],
 
-    [ 'symm',  gen + dtype + la + side + uplo + mn + ab + matrixBC ],
+    [ 'symm',  gen + dtype + la + side + sy_matrix + mn + ab + matrixBC ],
 
-    [ 'syr2k', gen + dtype_real    + la + uplo + trans    + mn + ab + matrixC ],
-    [ 'syr2k', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixC ],
+    [ 'syr2k', gen + dtype_real    + la + sy_matrix + trans    + mn + ab + matrixC ],
+    [ 'syr2k', gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixC ],
 
-    [ 'syrk',  gen + dtype_real    + la + uplo + trans    + mn + ab + matrixBC ],
-    [ 'syrk',  gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixBC ],
+    [ 'syrk',  gen + dtype_real    + la + sy_matrix + trans    + mn + ab + matrixBC ],
+    [ 'syrk',  gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixBC ],
 
     # todo: tbsm fails for nb=8 or 16 with --quick.
     [ 'tbsm',  gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ],
 
-    [ 'trmm',  gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
+    [ 'trmm',  gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
 
-    [ 'trsm',  gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
-    [ 'trsmA', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
-    [ 'trsmB', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
+    [ 'trsm',  gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
+    [ 'trsmA', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
+    [ 'trsmB', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
 ]
 
 # LU
 if (opts.lu):
     cmds += [
-        [ 'gesv',         gen + dtype + la + n + thresh + nonuniform_nb ],
-        [ 'gesv_tntpiv',  gen + dtype + la + n ],
-        [ 'gesv_nopiv',   gen + dtype + la + n + nonuniform_nb
+        [ 'gesv',         gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ],
+        [ 'gesv_tntpiv',  gen + dtype + la + n + ge_matrix ],
+        [ 'gesv_nopiv',   gen + dtype + la + n + ge_matrix + nonuniform_nb
                           + ' --matrix rand_dominant' ],
 
         # todo: mn
-        [ 'getrf',        gen + dtype + la + n + thresh + nonuniform_nb ],
-        [ 'getrf_tntpiv', gen + dtype + la + n ],
-        [ 'getrf_nopiv',  gen + dtype + la + n + nonuniform_nb
+        [ 'getrf',        gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ],
+        [ 'getrf_tntpiv', gen + dtype + la + n + ge_matrix ],
+        [ 'getrf_nopiv',  gen + dtype + la + n + ge_matrix + nonuniform_nb
                          + ' --matrix rand_dominant' ],
 
-        [ 'getrs',        gen + dtype + la + n + trans + thresh + nonuniform_nb ],
-        [ 'getrs_tntpiv', gen + dtype + la + n + trans ],
-        [ 'getrs_nopiv',  gen + dtype + la + n + trans + nonuniform_nb
+        [ 'getrs',        gen + dtype + la + n + trans + ge_matrix + nonuniform_nb + thresh ],
+        [ 'getrs_tntpiv', gen + dtype + la + n + trans + ge_matrix ],
+        [ 'getrs_nopiv',  gen + dtype + la + n + trans + ge_matrix + nonuniform_nb
                          + ' --matrix rand_dominant' ],
 
         [ 'getri',    gen + dtype + la + n ],
         [ 'getriOOP', gen + dtype + la + n ],
         #[ 'gerfs', gen + dtype + la + n + trans ],
         #[ 'geequ', gen + dtype + la + n ],
-        [ 'gesv_mixed',       gen + dtype_double + la + n + nonuniform_nb ],
-        [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb ],
-        [ 'gesv_rbt',         gen + dtype + la + n ],
+        [ 'gesv_mixed',       gen + dtype_double + la + n + ge_matrix + nonuniform_nb ],
+        [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ge_matrix + nonuniform_nb ],
+        [ 'gesv_rbt',         gen + dtype + la + n + ge_matrix ],
     ]
 
 # LU banded
@@ -424,14 +431,14 @@ def filter_csv( values, csv ):
 # Cholesky
 if (opts.chol):
     cmds += [
-        [ 'posv',   gen + dtype + la + n + uplo ],
-        [ 'potrf',  gen + dtype + la + n + uplo + ddist ],
-        [ 'potrs',  gen + dtype + la + n + uplo ],
-        [ 'potri',  gen + dtype + la + n + uplo ],
+        [ 'posv',   gen + dtype + la + n + he_matrix ],
+        [ 'potrf',  gen + dtype + la + n + he_matrix ],
+        [ 'potrs',  gen + dtype + la + n + he_matrix ],
+        [ 'potri',  gen + dtype + la + n ],
         #[ 'porfs', gen + dtype + la + n + uplo ],
         #[ 'poequ', gen + dtype + la + n ],  # only diagonal elements (no uplo)
-        [ 'posv_mixed',       gen + dtype_double + la + n + uplo ],
-        [ 'posv_mixed_gmres', gen + dtype_double + la + n + uplo + ' --nrhs 1' ],
+        [ 'posv_mixed',       gen + dtype_double + la + n + he_matrix ],
+        [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + he_matrix ],
         [ 'trtri', gen + dtype + la + n + uplo + diag ],
     ]
 
@@ -585,9 +592,9 @@ def filter_csv( values, csv ):
 # svd
 if (opts.svd):
     if ('n' in jobu):
-        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' ]]
+        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' + ge_matrix ]]
     if ('v' in jobu):
-        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' ]]
+        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' + ge_matrix ]]
 
     cmds += [
         # todo: mn (wide), nb, jobu, jobvt
@@ -600,10 +607,10 @@ def filter_csv( values, csv ):
 # norms
 if (opts.norms):
     cmds += [
-        [ 'genorm', gen + dtype + mn + norm ],
-        [ 'henorm', gen + dtype + n  + norm + uplo ],
-        [ 'synorm', gen + dtype + n  + norm + uplo ],
-        [ 'trnorm', gen + dtype + mn + norm + uplo + diag ],
+        [ 'genorm', gen + dtype + mn + norm + nonuniform_nb + ge_matrix ],
+        [ 'henorm', gen + dtype + n  + norm + nonuniform_nb + he_matrix ],
+        [ 'synorm', gen + dtype + n  + norm + nonuniform_nb + sy_matrix ],
+        [ 'trnorm', gen + dtype + mn + norm + nonuniform_nb + tr_matrix ],
 
         # Banded
        [ 'gbnorm', gen + dtype + mn + kl + ku + norm ],
@@ -628,31 +635,31 @@ def filter_csv( values, csv ):
 # aux
 if (opts.aux):
     cmds += [
-        [ 'add',    gen + dtype + mn + ab + nonuniform_nb ],
-        [ 'tzadd',  gen + dtype + mn + ab + nonuniform_nb + uplo ],
-        [ 'tradd',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'syadd',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'headd',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-
-        [ 'copy',   gen + dtype + mn + nonuniform_nb ],
-        [ 'tzcopy', gen + dtype + mn + nonuniform_nb + uplo ],
-        [ 'trcopy', gen + dtype + n  + nonuniform_nb + uplo ],
-        [ 'sycopy', gen + dtype + n  + nonuniform_nb + uplo ],
-        [ 'hecopy', gen + dtype + n  + nonuniform_nb + uplo ],
-
-        [ 'scale',   gen + dtype + mn + ab + nonuniform_nb ],
-        [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + uplo ],
-        [ 'trscale', gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'syscale', gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'hescale', gen + dtype + n  + ab + nonuniform_nb + uplo ],
-
-        [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb ],
-
-        [ 'set',    gen + dtype + mn + ab + nonuniform_nb ],
-        [ 'tzset',  gen + dtype + mn + ab + nonuniform_nb + uplo ],
-        [ 'trset',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'syset',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'heset',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
+        [ 'add',    gen + dtype + mn + ab + nonuniform_nb + ge_matrix ],
+        [ 'tzadd',  gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'tradd',  gen + dtype + n  + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'syadd',  gen + dtype + n  + ab + nonuniform_nb + sy_matrix ],
+        [ 'headd',  gen + dtype + n  + ab + nonuniform_nb + he_matrix ],
+
+        [ 'copy',   gen + dtype + mn + nonuniform_nb + ge_matrix ],
+        [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ge_matrix + uplo ],
+        [ 'trcopy', gen + dtype + n  + nonuniform_nb + ge_matrix + uplo ],
+        [ 'sycopy', gen + dtype + n  + nonuniform_nb + sy_matrix ],
+        [ 'hecopy', gen + dtype + n  + nonuniform_nb + he_matrix ],
+
+        [ 'scale',   gen + dtype + mn + ab + nonuniform_nb + ge_matrix ],
+        [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'trscale', gen + dtype + n  + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'syscale', gen + dtype + n  + ab + nonuniform_nb + sy_matrix ],
+        [ 'hescale', gen + dtype + n  + ab + nonuniform_nb + he_matrix ],
+
+        [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ge_matrix ],
+
+        [ 'set',    gen + dtype + mn + ab + nonuniform_nb + ge_matrix ],
+        [ 'tzset',  gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'trset',  gen + dtype + n  + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'syset',  gen + dtype + n  + ab + nonuniform_nb + sy_matrix ],
+        [ 'heset',  gen + dtype + n  + ab + nonuniform_nb + he_matrix ],
     ]
 
 # ------------------------------------------------------------------------------
diff --git a/test/scalapack_support_routines.hh b/test/scalapack_support_routines.hh
deleted file mode 100644
index 458dc7e42..000000000
--- a/test/scalapack_support_routines.hh
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright (c) 2009-2023, University of Tennessee. All rights reserved.
-// Copyright (c) 2010, University of Denver, Colorado.
-// SPDX-License-Identifier: BSD-3-Clause
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
-
-#ifndef SLATE_SCALAPACK_SUPPORT_HH
-#define SLATE_SCALAPACK_SUPPORT_HH
-
-#include <complex>
-
-#include "scalapack_wrappers.hh"
-
-//------------------------------------------------------------------------------
-// Matrix generation
-#define Rnd64_A  6364136223846793005ULL
-#define Rnd64_C  1ULL
-#define RndF_Mul 5.4210108624275222e-20f
-#define RndD_Mul 5.4210108624275222e-20
-
-typedef unsigned long long ull;
-
-static inline ull Rnd64_jump(ull n, ull seed)
-{
-    ull a_k, c_k, ran;
-    int64_t i;
-
-    a_k = Rnd64_A;
-    c_k = Rnd64_C;
-    ran = seed;
-    for (i = 0; n; n >>= 1, ++i) {
-        if (n & 1)
-            ran = a_k*ran + c_k;
-        c_k *= (a_k + 1);
-        a_k *= a_k;
-    }
-    return ran;
-}
-
-template <typename scalar_t>
-static inline void CORE_plrnt(int64_t m, int64_t n, scalar_t* A, int64_t lda,
-                              int64_t bigM, int64_t m0, int64_t n0, ull seed)
-{
-    scalar_t* tmp = A;
-    int64_t i, j;
-    ull ran, jump;
-
-    jump = (ull)m0 + (ull)n0*(ull)bigM;
-    for (j = 0; j < n; ++j) {
-        ran = Rnd64_jump(jump, (ull)seed);
-        for (i = 0; i < m; ++i) {
-            *tmp = 0.5f - ran*RndF_Mul;
-            ran = Rnd64_A*ran + Rnd64_C;
-            ++tmp;
-        }
-        tmp += lda - i;
-        jump += bigM;
-    }
-}
-
-template <typename scalar_t>
-static inline void CORE_plghe(scalar_t bump, int64_t m, int64_t n, scalar_t* A, int64_t lda,
-                              int64_t gM, int64_t m0, int64_t n0, ull seed)
-{
-    scalar_t* tmp = A;
-    int64_t i, j;
-    ull ran, jump;
-
-    jump = (ull)m0 + (ull)n0*(ull)gM;
-    /* Tile diagonal */
-    if (m0 == n0) {
-        for (j = 0; j < n; ++j) {
-            ran = Rnd64_jump(jump, seed);
-
-            for (i = j; i < m; ++i) {
-                *tmp = 0.5f - ran*RndF_Mul;
-                ran = Rnd64_A*ran + Rnd64_C;
-                ++tmp;
-            }
-            tmp += (lda - i + j + 1);
-            jump += gM + 1;
-        }
-        for (j = 0; j < n; ++j) {
-            A[j + j*lda] += bump;
-
-            for (i = 0; i < j; ++i)
-                A[lda*j + i] = A[lda*i + j];
-        }
-    }
-
-    /* Lower part */
-    else if (m0 > n0) {
-        for (j = 0; j < n; ++j) {
-            ran = Rnd64_jump(jump, seed);
-
-            for (i = 0; i < m; ++i) {
-                *tmp = 0.5f - ran*RndF_Mul;
-                ran = Rnd64_A*ran + Rnd64_C;
-                ++tmp;
-            }
-            tmp += (lda - i);
-            jump += gM;
-        }
-    }
-
-    /* Upper part */
-    else if (m0 < n0) {
-        /* Overwrite jump */
-        jump = (ull)n0 + (ull)m0*(ull)gM;
-
-        for (i = 0; i < m; ++i) {
-            ran = Rnd64_jump(jump, seed);
-
-            for (j = 0; j < n; ++j) {
-                A[j*lda + i] = 0.5f - ran*RndF_Mul;
-                ran = Rnd64_A*ran + Rnd64_C;
-            }
-            jump += gM;
-        }
-    }
-}
-
-template <typename scalar_t>
-static void scalapack_pplrnt(scalar_t* A,
-                             int64_t m, int64_t n,
-                             int64_t mb, int64_t nb,
-                             blas_int myrow, blas_int mycol,
-                             blas_int nprow, blas_int npcol,
-                             int64_t lldA,
-                             int64_t seed)
-{
-    blas_int idum1, idum2, iloc, jloc, i0 = 0;
-    blas_int tempm, tempn;
-    scalar_t* Ab;
-    blas_int mb_ = blas_int( mb );
-    blas_int nb_ = blas_int( nb );
-
-    // #pragma omp parallel for
-    for (blas_int i = 1; i <= m; i += mb) {
-        for (blas_int j = 1; j <= n; j += nb) {
-            if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) &&
-                (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) {
-                iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow);
-                jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol);
-
-                Ab = &A[(jloc - 1)*lldA + (iloc - 1) ];
-                tempm = (m - i + 1) > mb ? mb : (m - i + 1);
-                tempn = (n - j + 1) > nb ? nb : (n - j + 1);
-                CORE_plrnt(tempm, tempn, Ab, lldA,
-                           m, (i - 1), (j - 1), seed);
-            }
-        }
-    }
-}
-
-
-template <typename scalar_t>
-static void scalapack_pplghe(scalar_t* A,
-                             int64_t m, int64_t n,
-                             int64_t mb, int64_t nb,
-                             blas_int myrow, blas_int mycol,
-                             blas_int nprow, blas_int npcol,
-                             int64_t lldA,
-                             int64_t seed)
-{
-    blas_int idum1, idum2, iloc, jloc, i0 = 0;
-    int64_t tempm, tempn;
-    scalar_t* Ab;
-    scalar_t bump = (scalar_t)m;
-    blas_int mb_ = blas_int( mb );
-    blas_int nb_ = blas_int( nb );
-
-    // #pragma omp parallel for
-    for (blas_int i = 1; i <= m; i += mb) {
-        for (blas_int j = 1; j <= n; j += nb) {
-            if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) &&
-                (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) {
-                iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow);
-                jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol);
-
-                Ab = &A[(jloc - 1)*lldA + (iloc - 1) ];
-                tempm = (m - i + 1) > mb ? mb : (m - i + 1);
-                tempn = (n - j + 1) > nb ? nb : (n - j + 1);
-                CORE_plghe(bump, tempm, tempn, Ab, lldA,
-                           m, (i - 1), (j - 1), seed);
-            }
-        }
-    }
-}
-
-#endif // SLATE_SCALAPACK_SUPPORT_HH
diff --git a/test/test.cc b/test/test.cc
index 078df162a..3ba473dc2 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -356,8 +356,8 @@ Params::Params():
     method_lu   ("lu",   5, ParamType::List, slate::MethodLU::PartialPiv, str2methodLU, methodLU2str, "PartialPiv, CALU, NoPiv"),
     method_trsm ("trsm", 4, ParamType::List, 0, str2methodTrsm, methodTrsm2str, "auto=auto, A=trsmA, B=trsmB"),
-    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
-    dev_dist  ("dev-dist", 9, ParamType::List, slate::Dist::Col, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"),
+    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
+    dev_order ("do", 3, ParamType::List, slate::GridOrder::Row, str2grid_order, grid_order2str, "(do) Device grid order: c=Col, r=Row"),
 
     // name, w, type, default, char2enum, enum2char, enum2str, help
     layout ("layout", 6, ParamType::List, slate::Layout::ColMajor, blas::char2layout, blas::layout2char, blas::layout2str, "layout: r=row major, c=column major"),
@@ -463,6 +463,7 @@ Params::Params():
     lookahead.name("la", "lookahead");
     panel_threads.name("pt", "panel-threads");
     grid_order.name("go", "grid-order");
+    dev_order.name("do", "dev-order");
 
     // Change name for the methods to use less space in the stdout
     method_cholQR.name("cholQR", "method-cholQR");
diff --git a/test/test.hh b/test/test.hh
index b0d48e0f3..8dec12510 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -21,15 +21,10 @@
 // -----------------------------------------------------------------------------
 namespace slate {
 
-enum class Origin {
-    Host,
-    ScaLAPACK,
-    Devices,
-};
-
-enum class Dist {
-    Row,
-    Col,
+enum class Origin : char {
+    Host      = 'H',
+    ScaLAPACK = 'S',
+    Devices   = 'D',
 };
 
 } // namespace slate
@@ -88,7 +83,7 @@ public:
     testsweeper::ParamEnum< slate::Method > method_trsm;
 
     testsweeper::ParamEnum< slate::GridOrder > grid_order;
-    testsweeper::ParamEnum< slate::Dist > dev_dist;
+    testsweeper::ParamEnum< slate::GridOrder > dev_order;
 
     // ----- test matrix parameters
     MatrixParams matrix;
@@ -284,32 +279,6 @@ void test_scale (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set (Params& params, bool run);
 
-// -----------------------------------------------------------------------------
-inline slate::Dist str2dist(const char* dist)
-{
-    std::string distribution_ = dist;
-    std::transform(
-        distribution_.begin(),
-        distribution_.end(),
-        distribution_.begin(), ::tolower);
-    if (distribution_ == "row" || distribution_ == "r")
-        return slate::Dist::Row;
-    else if (distribution_ == "col" || distribution_ == "c"
-             || distribution_ == "column")
-        return slate::Dist::Col;
-    else
-        throw slate::Exception("unknown distribution");
-}
-
-inline const char* dist2str(slate::Dist dist)
-{
-    switch (dist) {
-        case slate::Dist::Row: return "row";
-        case slate::Dist::Col: return "col";
-    }
-    return "?";
-}
-
 // -----------------------------------------------------------------------------
 inline slate::Origin str2origin(const char* origin)
 {
diff --git a/test/test_add.cc b/test/test_add.cc
index 3e51600b3..0596c73f8 100644
--- a/test/test_add.cc
+++ b/test/test_add.cc
@@ -7,10 +7,9 @@
 #include "test.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
 #include "matrix_utils.hh"
 #include "test_utils.hh"
@@ -49,8 +48,6 @@ void test_add_work(Params& params, bool run)
     else {
         n = params.dim.n();
     }
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     bool ref_only = params.ref() == 'o';
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
@@ -67,6 +64,11 @@ void test_add_work(Params& params, bool run)
     if (! run)
         return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
+
     slate::Options const opts = {
         {slate::Option::Target, target}
     };
@@ -133,7 +135,7 @@ void test_add_work(Params& params, bool run)
 
         // initialize BLACS and ScaLAPACK
         blas_int ictxt, A_desc[9], B_desc[9];
-        create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
+        A_alloc.create_ScaLAPACK_context( &ictxt );
         A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
         B_alloc.ScaLAPACK_descriptor( ictxt, B_desc );
@@ -195,7 +197,7 @@ void test_add_work(Params& params, bool run)
         //Cblacs_exit(1) does not handle re-entering
 #else  // not SLATE_HAVE_SCALAPACK
     SLATE_UNUSED( trans );
-    if (mpi_rank == 0)
+    if (A.mpiRank() == 0)
         printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_bdsqr.cc b/test/test_bdsqr.cc
index 8e4ac041f..b3ffa96f0 100644
--- a/test/test_bdsqr.cc
+++ b/test/test_bdsqr.cc
@@ -7,7 +7,6 @@
 #include "blas.hh"
 #include "test.hh"
 #include "print_matrix.hh"
-#include "scalapack_support_routines.hh"
 #include "internal/internal.hh"
 #include "band_utils.hh"
 #include "grid_utils.hh"
diff --git a/test/test_copy.cc b/test/test_copy.cc
index 84721bb9d..d7b3bf0fc 100644
--- a/test/test_copy.cc
+++ b/test/test_copy.cc
@@ -7,10 +7,9 @@
 #include "test.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
 #include "matrix_utils.hh"
 #include "test_utils.hh"
@@ -47,8 +46,6 @@ void test_copy_work(Params& params, bool run)
     else {
         n = params.dim.n();
     }
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     bool ref_only = params.ref() == 'o';
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
@@ -65,6 +62,11 @@ void test_copy_work(Params& params, bool run)
     if (! run)
         return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
+
     slate::Options const opts = {
         {slate::Option::Target, target}
     };
@@ -135,7 +137,7 @@ void test_copy_work(Params& params, bool run)
 
         // initialize BLACS and ScaLAPACK
         blas_int ictxt, A_desc[9], B_desc[9];
-        create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
+        A_alloc.create_ScaLAPACK_context( &ictxt );
         A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
         B_alloc.ScaLAPACK_descriptor( ictxt, B_desc );
@@ -183,7 +185,7 @@ void test_copy_work(Params& params, bool run)
 #else  // not SLATE_HAVE_SCALAPACK
     SLATE_UNUSED( A_norm );
     SLATE_UNUSED( B_norm );
-    if (mpi_rank == 0)
+    if (A.mpiRank() == 0)
         printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_gbnorm.cc b/test/test_gbnorm.cc
index 883d07aff..7ac9a510d 100644
--- a/test/test_gbnorm.cc
+++ b/test/test_gbnorm.cc
@@ -7,7 +7,6 @@
 #include "test.hh"
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "print_matrix.hh"
 #include "band_utils.hh"
 #include "grid_utils.hh"
diff --git a/test/test_gecondest.cc b/test/test_gecondest.cc
index 9bb9badad..8f365bee8 100644
--- a/test/test_gecondest.cc
+++ b/test/test_gecondest.cc
@@ -11,7 +11,6 @@
 #include "grid_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
diff --git a/test/test_gelqf.cc b/test/test_gelqf.cc
index 915d67455..68a84747c 100644
--- a/test/test_gelqf.cc
+++ b/test/test_gelqf.cc
@@ -11,7 +11,6 @@
 #include "grid_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
diff --git a/test/test_gels.cc b/test/test_gels.cc
index 9ad6a3bbc..2d20e213a 100644
--- a/test/test_gels.cc
+++ b/test/test_gels.cc
@@ -11,7 +11,6 @@
 #include "grid_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
diff --git a/test/test_gemm.cc b/test/test_gemm.cc
index 746e2f74d..8add9e9c5 100644
--- a/test/test_gemm.cc
+++ b/test/test_gemm.cc
@@ -7,11 +7,12 @@
 #include "test.hh"
 #include "blas/flops.hh"
 #include "print_matrix.hh"
+
 #include "grid_utils.hh"
 #include "matrix_utils.hh"
+#include "test_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
@@ -44,19 +45,15 @@ void test_gemm_work(Params& params, bool run)
     int64_t n = params.dim.n();
     int64_t nrhs = params.nrhs();
     int64_t k = params.dim.k();
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     int64_t lookahead = params.lookahead();
     bool ref_only = params.ref() == 'o';
     slate::Norm norm = params.norm();
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
     bool trace = params.trace() == 'y';
-    bool nonuniform_nb = params.nonuniform_nb() == 'y';
     int verbose = params.verbose();
     slate::Target target = params.target();
     slate::Origin origin = params.origin();
-    slate::GridOrder grid_order = params.grid_order();
     slate::Method method_gemm = params.method_gemm();
     params.matrix.mark();
     params.matrixB.mark();
@@ -77,14 +74,10 @@ void test_gemm_work(Params& params, bool run)
     if (! run)
         return;
 
-    #ifndef SLATE_HAVE_SCALAPACK
-    // Can run ref only when we have ScaLAPACK.
-    if (ref) {
-        if (mpi_rank == 0)
-            printf( "ScaLAPACK not available\n" );
-        ref = false;
-    }
-    #endif
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
 
     slate::Options const opts = {
         {slate::Option::Lookahead, lookahead},
@@ -116,9 +109,14 @@ void test_gemm_work(Params& params, bool run)
     slate::generate_matrix(params.matrixB, B);
     slate::generate_matrix(params.matrixC, C);
 
-    // if reference run is required, copy test data.
+    // If reference run is required, record norms to be used in the check/ref.
+    real_t A_norm=0, B_norm=0, C_orig_norm=0;
     if (ref) {
         slate::copy( C, Cref );
+
+        A_norm = slate::norm(norm, A);
+        B_norm = slate::norm(norm, B);
+        C_orig_norm = slate::norm(norm, Cref);
     }
 
     if (transA == slate::Op::Trans)
@@ -135,14 +133,6 @@ void test_gemm_work(Params& params, bool run)
     slate_assert(B.nt() == C.nt());
     slate_assert(A.nt() == B.mt());
 
-    // If reference run is required, record norms to be used in the check/ref.
-    real_t A_norm=0, B_norm=0, C_orig_norm=0;
-    if (ref) {
-        A_norm = slate::norm(norm, A);
-        B_norm = slate::norm(norm, B);
-        C_orig_norm = slate::norm(norm, Cref);
-    }
-
     // If check run, perform first half of SLATE residual check.
     TestMatrix<slate::Matrix<scalar_t>> X_alloc, Y_alloc, Z_alloc;
     if (check && ! ref) {
@@ -224,14 +214,10 @@ void test_gemm_work(Params& params, bool run)
     if (ref) {
         #ifdef SLATE_HAVE_SCALAPACK
             // comparison with reference routine from ScaLAPACK
-            if (nonuniform_nb) {
-                params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK";
-                return;
-            }
 
             // initialize BLACS and ScaLAPACK
             blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9];
-            create_ScaLAPACK_context( grid_order, p, q, &ictxt );
+            A_alloc.create_ScaLAPACK_context( &ictxt );
             A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
             B_alloc.ScaLAPACK_descriptor( ictxt, B_desc );
diff --git a/test/test_genorm.cc b/test/test_genorm.cc
index 5437d9429..41c03cee5 100644
--- a/test/test_genorm.cc
+++ b/test/test_genorm.cc
@@ -7,10 +7,11 @@
 #include "test.hh"
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
+#include "matrix_utils.hh"
+#include "test_utils.hh"
 
 #include <cmath>
 #include <set>
@@ -39,12 +40,17 @@ void test_genorm_work(Params& params, bool run)
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
     bool trace = params.trace() == 'y';
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    bool ref_copy = nonuniform_nb && (check || ref);
     int verbose = params.verbose();
     int extended = params.extended();
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
+    slate::GridOrder grid_order = params.grid_order();
     params.matrix.mark();
 
+    mark_params_for_test_Matrix( params );
+
     // mark non-standard output values
     params.time();
     params.ref_time();
 
     if (! run)
         return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params, true )) {
+        return;
+    }
+
     slate::Options const opts = {
         {slate::Option::Target, target}
     };
 
-    // MPI variables
-    int mpi_rank, myrow, mycol;
-    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-    gridinfo(mpi_rank, p, q, &myrow, &mycol);
+    auto A_alloc = allocate_test_Matrix<scalar_t>( ref_copy, false, m, n, params );
 
-    // Matrix A: figure out local size.
-    int64_t mlocA = num_local_rows_cols(m, nb, myrow, p);
-    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
-    int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
+    auto& A = A_alloc.A;
+    auto& Aref = A_alloc.Aref;
 
-    // Allocate ScaLAPACK data if needed.
-    std::vector<scalar_t> A_data;
-    if (check || ref || origin == slate::Origin::ScaLAPACK) {
-        A_data.resize( lldA * nlocA );
-    }
+    slate::generate_matrix(params.matrix, A);
 
-    slate::Matrix<scalar_t> A;
-    if (origin != slate::Origin::ScaLAPACK) {
-        // SLATE allocates CPU or GPU tiles.
-        slate::Target origin_target = origin2target(origin);
-        A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
-        A.insertLocalTiles(origin_target);
-    }
-    else {
-        // Create SLATE matrix from the ScaLAPACK layout.
-        A = slate::Matrix<scalar_t>::fromScaLAPACK(
-                m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+    if (ref_copy) {
+        copy_matrix( A, Aref );
     }
 
-    slate::generate_matrix(params.matrix, A);
-
     std::vector<real_t> values;
     if (scope == slate::NormScope::Columns) {
         values.resize(A.n());
@@ -136,33 +127,21 @@ void test_genorm_work(Params& params, bool run)
 #ifdef SLATE_HAVE_SCALAPACK
         // comparison with reference routine from ScaLAPACK
 
-        // BLACS/MPI variables
-        blas_int ictxt, p_, q_, myrow_, mycol_;
-        blas_int A_desc[9];
-        blas_int mpi_rank_ = 0, nprocs = 1;
-
         // initialize BLACS and ScaLAPACK
-        Cblacs_pinfo(&mpi_rank_, &nprocs);
-        slate_assert( mpi_rank_ == mpi_rank );
-        slate_assert(p*q <= nprocs);
-        Cblacs_get(-1, 0, &ictxt);
-        Cblacs_gridinit(&ictxt, "Col", p, q);
-        Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_);
-        slate_assert( p == p_ );
-        slate_assert( q == q_ );
-        slate_assert( myrow == myrow_ );
-        slate_assert( mycol == mycol_ );
-
-        int64_t info;
-        scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info);
-        slate_assert(info == 0);
-
-        if (origin != slate::Origin::ScaLAPACK) {
+        blas_int ictxt, A_desc[9];
+        A_alloc.create_ScaLAPACK_context( &ictxt );
+        A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
+
+        auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data;
+
+        if (origin != slate::Origin::ScaLAPACK && !ref_copy) {
+            A_data.resize( A_alloc.lld * A_alloc.nloc );
+
             copy(A, &A_data[0], A_desc);
         }
 
         // allocate work space
-        std::vector<real_t> worklange(std::max(mlocA, nlocA));
+        std::vector<real_t> worklange(std::max(A_alloc.mloc, A_alloc.nloc));
 
         // (Sca)LAPACK norms don't support trans; map One <=> Inf norm.
         slate::Norm op_norm = norm;
@@ -199,10 +178,6 @@ void test_genorm_work(Params& params, bool run)
         }
         time = barrier_get_wtime(MPI_COMM_WORLD) - time;
 
-        //A_norm_ref = lapack::lange(
-        //    op_norm,
-        //    m, n, &A_data[0], lldA);
-
         if (scope == slate::NormScope::Matrix) {
             // difference between norms
             error = std::abs(A_norm - A_norm_ref) / A_norm_ref;
@@ -216,7 +191,7 @@ void test_genorm_work(Params& params, bool run)
             error /= sqrt(m*n);
         }
 
-        if (verbose && mpi_rank == 0) {
+        if (verbose && A.mpiRank() == 0) {
             printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n",
                    A_norm, A_norm_ref, A_norm_ref - A_norm, error);
         }
@@ -238,107 +213,113 @@ void test_genorm_work(Params& params, bool run)
 
     //---------- extended tests
     if (extended && scope == slate::NormScope::Matrix) {
-        // seed all MPI processes the same
-        srand(1234);
-
-        // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols,
-        // up to 64 tiles total.
-        // Indices may be out-of-bounds if mt or nt is small, so check in loops.
- int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - if (i < 0 || i >= mt) + else { + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib) { + for (auto i : i_indices) { + if (i < 0 || i >= mt) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the genorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - A_norm_ref = scalapack_plange( - norm2str(norm), m, n, - &A_data[0], 1, 1, A_desc, - &worklange[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + A_norm_ref = scalapack_plange( + norm2str(norm), m, n, + &A_data[0], 1, 1, A_desc, + &worklange[0]); - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the genorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -351,7 +332,7 @@ void test_genorm_work(Params& params, bool run) SLATE_UNUSED( A_norm ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_geqrf.cc b/test/test_geqrf.cc index 6ae7d8302..18c8bf114 100644 --- a/test/test_geqrf.cc +++ b/test/test_geqrf.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gesv.cc b/test/test_gesv.cc index a19beadb8..1e7b00541 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -8,11 +8,11 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include @@ -59,8 +59,6 @@ void test_gesv_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t ib = params.ib(); int64_t lookahead = params.lookahead(); int64_t panel_threads = params.panel_threads(); @@ -68,13 +66,10 @@ void test_gesv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); SLATE_UNUSED(verbose); - slate::Origin origin = params.origin(); slate::Target target = params.target(); - slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); params.matrixB.mark(); @@ -156,8 +151,8 @@ void test_gesv_work(Params& params, bool run) if (! 
run) return; - if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { return; } @@ -385,14 +380,10 @@ void test_gesv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); @@ -433,7 +424,7 @@ void test_gesv_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_getri.cc b/test/test_getri.cc index 6a0343e52..3693dcba1 100644 --- a/test/test_getri.cc +++ b/test/test_getri.cc @@ -10,7 +10,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hb2st.cc b/test/test_hb2st.cc index 5c3f4bce7..607250fe1 100644 --- a/test/test_hb2st.cc +++ b/test/test_hb2st.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_hbnorm.cc b/test/test_hbnorm.cc index c62de133a..bf5c5cd2c 100644 --- a/test/test_hbnorm.cc +++ b/test/test_hbnorm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "band_utils.hh" diff --git a/test/test_heev.cc b/test/test_heev.cc index 505c1c0f3..fc7077f2a 100644 --- a/test/test_heev.cc +++ b/test/test_heev.cc @@ -9,7 +9,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hegst.cc b/test/test_hegst.cc index 93d78d275..35574087d 100644 --- a/test/test_hegst.cc +++ b/test/test_hegst.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "auxiliary/Debug.hh" #include "grid_utils.hh" diff --git a/test/test_hegv.cc b/test/test_hegv.cc index d342be443..f6061b0a1 100644 --- a/test/test_hegv.cc +++ b/test/test_hegv.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hemm.cc b/test/test_hemm.cc index 946a60798..d874c40ac 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -8,10 +8,11 @@ #include "blas/flops.hh" #include "print_matrix.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -42,10 +43,7 @@ void test_hemm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = 
params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -59,6 +57,9 @@ void test_hemm_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +73,11 @@ void test_hemm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -85,93 +91,48 @@ void test_hemm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::HermitianMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::HermitianMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_HermitianMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - std::vector Cref_data; - slate::Matrix Cref; - if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. 
- Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - m, n, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy(C, Cref); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). @@ -237,6 +198,9 @@ void test_hemm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -255,56 +219,31 @@ void test_hemm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. 
 copy(A, &A_data[0], A_desc);
 copy(B, &B_data[0], B_desc);
 copy(C, &C_data[0], C_desc);
 }
- // allocate workspace for norms
- int64_t ldw = nb*ceildiv( ceildiv( mlocA, nb ),
- scalapack_ilcm( p, q ) / p );
- std::vector<real_t> worklansy(2*nlocA + mlocA + ldw);
- std::vector<real_t> worklange(std::max({mlocC, nlocC, mlocB, nlocB}));
-
- // get norms of the original data
- real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An,
- &A_data[0], 1, 1, A_desc, &worklansy[0]);
- real_t B_norm = scalapack_plange(
- norm2str(norm), Bm, Bn, &B_data[0],1, 1, B_desc, &worklange[0]);
- real_t C_orig_norm = scalapack_plange(
- norm2str(norm), Cm, Cn, &Cref_data[0], 1, 1, Cref_desc, &worklange[0]);
-
 //==================================================
 // Run ScaLAPACK reference routine.
 //==================================================
@@ -315,12 +254,13 @@ void test_hemm_work(Params& params, bool run)
 &Cref_data[0], 1, 1, Cref_desc);
 time = barrier_get_wtime(MPI_COMM_WORLD) - time;
- // Local operation: error = Cref_data - C_data
- blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1);
+ // get differences C = C - Cref
+ slate::add(-one, Cref, one, C);
+
+ print_matrix( "Diff", C, params );
- // norm(Cref_data - C_data)
- real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0],
- 1, 1, Cref_desc, &worklange[0]);
+ // norm(C - Cref)
+ real_t C_diff_norm = slate::norm(norm, C);
 real_t error = C_diff_norm
 / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm
@@ -336,7 +276,7 @@ void test_hemm_work(Params& params, bool run)
 Cblacs_gridexit(ictxt);
 //Cblacs_exit(1) does not handle re-entering
 #else // not SLATE_HAVE_SCALAPACK
- if (mpi_rank == 0)
+ if (A.mpiRank() == 0)
 printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_henorm.cc b/test/test_henorm.cc
index 08b5fd4bc..a9a24a9fe 100644
--- a/test/test_henorm.cc
+++ b/test/test_henorm.cc
@@ -7,10 +7,11 @@
 #include "test.hh"
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
+#include "matrix_utils.hh"
+#include "test_utils.hh"
 #include
 #include
@@ -36,12 +37,17 @@ void test_henorm_work(Params& params, bool run)
 bool check = params.check() == 'y';
 bool ref = params.ref() == 'y';
 bool trace = params.trace() == 'y';
+ bool nonuniform_nb = params.nonuniform_nb() == 'y';
+ bool ref_copy = nonuniform_nb && (check || ref);
 int verbose = params.verbose();
 int extended = params.extended();
 slate::Origin origin = params.origin();
 slate::Target target = params.target();
+ slate::GridOrder grid_order = params.grid_order();
 params.matrix.mark();
+ mark_params_for_test_HermitianMatrix( params );
+
 // mark non-standard output values
 params.time();
 params.ref_time();
@@ -51,43 +57,25 @@
 return;
 }
+ // Check for common invalid combinations
+ if (is_invalid_parameters( params, true )) {
+ return;
+ }
+
 slate::Options const opts = {
 {slate::Option::Target, target}
 };
- // MPI variables
- int mpi_rank, myrow, mycol;
- MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
- gridinfo(mpi_rank, p, q, &myrow, &mycol);
+ auto A_alloc = allocate_test_HermitianMatrix( ref_copy, false, n, params );
- // Matrix A: figure out local size.
- int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::HermitianMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::HermitianMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::HermitianMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + if (ref_copy) { + copy_matrix( A, Aref ); } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - } - - slate::generate_matrix( params.matrix, A ); print_matrix("A", A, params); @@ -109,41 +97,31 @@ void test_henorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); - } - - if (check || ref) { + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); + } + // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklanhe( lwork ); + // comparison with reference routine from ScaLAPACK + //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -153,10 +131,6 @@ void test_henorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lanhe( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -166,7 +140,7 @@ void test_henorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -184,140 +158,131 @@ void test_henorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklanhe(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. - int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); - } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) - continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); + } + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? 
i < j : i > j)) continue; - } - - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } - - A_norm = slate::norm(norm, A, opts); - - real_t A_norm_ref = scalapack_planhe( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); - - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } - - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) + continue; + + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } + + A_norm = slate::norm(norm, A, opts); + + A_norm_ref = scalapack_planhe( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); + + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! 
okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } - - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_her2k.cc b/test/test_her2k.cc index a2af3cef2..7fe3ed067 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -8,10 +8,11 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +39,7 @@ void test_her2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); real_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +51,9 @@ void test_her2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +67,11 @@ void test_her2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -81,72 +87,30 @@ void test_her2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. 
- int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector< scalar_t > A_data(lldA*nlocA); - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - std::vector< scalar_t > B_data(lldB*nlocB); - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - std::vector< scalar_t > C_data(lldC*nlocC); - - slate::Matrix A, B; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); + + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector< scalar_t > Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -172,17 +136,19 @@ void test_her2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! 
ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^H X)) + (conj(alpha) B (A^H X)) + (beta C X). // Y = beta C X @@ -221,6 +187,9 @@ void test_her2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -239,54 +208,30 @@ void test_her2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or 
CPU tiles.
+ copy(A, &A_data[0], A_desc);
+ copy(B, &B_data[0], B_desc);
+ copy(C, &C_data[0], C_desc);
+ }
 //==================================================
 // Run ScaLAPACK reference routine.
@@ -300,13 +245,11 @@ void test_her2k_work(Params& params, bool run)
 print_matrix("Cref", Cref, params);
- // local operation: error = Cref_data - C_data
- blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1);
+ // get differences C = C - Cref
+ slate::add(-one, Cref, one, C);
- // norm(Cref_data - C_data)
- real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn,
- &Cref_data[0], 1, 1, Cref_desc,
- &worklansy[0]);
+ // norm(C - Cref)
+ real_t C_diff_norm = slate::norm(norm, C);
 real_t error = C_diff_norm
 / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm
@@ -323,7 +266,7 @@ void test_her2k_work(Params& params, bool run)
 Cblacs_gridexit(ictxt);
 //Cblacs_exit(1) does not handle re-entering
 #else // not SLATE_HAVE_SCALAPACK
- if (mpi_rank == 0)
+ if (A.mpiRank() == 0)
 printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_herk.cc b/test/test_herk.cc
index 8e5660241..2c86f7500 100644
--- a/test/test_herk.cc
+++ b/test/test_herk.cc
@@ -7,10 +7,11 @@
 #include "test.hh"
 #include "blas/flops.hh"
+#include "matrix_utils.hh"
+#include "test_utils.hh"
+
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
-#include "grid_utils.hh"
 #include
 #include
@@ -35,10 +36,7 @@ void test_herk_work(Params& params, bool run)
 int64_t k = params.dim.k();
 real_t alpha = params.alpha.get();
 real_t beta = params.beta.get();
- int p = params.grid.m();
- int q = params.grid.n();
 int64_t nrhs = params.nrhs();
- int64_t nb = params.nb();
 int64_t lookahead = params.lookahead();
 slate::Norm norm = params.norm();
 bool check = params.check() == 'y';
@@ -47,7 +45,10 @@ void test_herk_work(Params& params, bool run)
 slate::Origin origin = params.origin();
 slate::Target target = params.target();
 params.matrix.mark();
- params.matrixB.mark();
+ params.matrixC.mark();
+
+ mark_params_for_test_HermitianMatrix( params );
+ mark_params_for_test_Matrix( params );
 // mark non-standard output values
 params.time();
@@ -62,6 +63,11 @@
 if (! run)
 return;
+ // Check for common invalid combinations
+ if (is_invalid_parameters( params )) {
+ return;
+ }
+
 slate::Options const opts = {
 {slate::Option::Lookahead, lookahead},
 {slate::Option::Target, target},
@@ -75,64 +81,26 @@
 // setup so op(A) is n-by-k
 int64_t Am = (transA == slate::Op::NoTrans ? n : k);
 int64_t An = (transA == slate::Op::NoTrans ? k : n);
- int64_t Cm = n;
 int64_t Cn = n;
- // MPI variables
- int mpi_rank, myrow, mycol;
- MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
- gridinfo(mpi_rank, p, q, &myrow, &mycol);
-
- // Matrix A: figure out local size.
- int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p);
- int64_t nlocA = num_local_rows_cols(An, nb, mycol, q);
- int64_t lldA = blas::max(1, mlocA); // local leading dimension of A
-
- // Matrix C: figure out local size.
- int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p);
- int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q);
- int64_t lldC = blas::max(1, mlocC); // local leading dimension of C
-
- // Allocate ScaLAPACK data if needed.
- std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); - slate::Matrix A; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); - slate::generate_matrix( params.matrixB, C ); - - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + slate::generate_matrix( params.matrixC, C ); + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -147,17 +115,19 @@ void test_herk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^H X) + (beta C X). // Y = beta C X @@ -189,6 +159,9 @@ void test_herk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -207,48 +180,26 @@ void test_herk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or CPU tiles. + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); + } //================================================== // Run ScaLAPACK reference routine. 
@@ -259,13 +210,11 @@ void test_herk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -281,7 +230,7 @@ void test_herk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_hesv.cc b/test/test_hesv.cc index 2e8f54af1..8dbe4b83c 100644 --- a/test/test_hesv.cc +++ b/test/test_hesv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" #include "print_matrix.hh" diff --git a/test/test_pocondest.cc b/test/test_pocondest.cc index 47d45109e..081e03e29 100644 --- a/test/test_pocondest.cc +++ b/test/test_pocondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_posv.cc b/test/test_posv.cc index 46f3c3cc6..398ac323b 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -9,11 +9,11 @@ #include "lapack/flops.hh" #include "print_matrix.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" -#include "scalapack_copy.hh" #include "auxiliary/Debug.hh" -#include "grid_utils.hh" #include #include @@ -33,8 +33,6 @@ void test_posv_work(Params& params, bool run) slate::Uplo uplo = params.uplo(); int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); bool ref_only = params.ref() == 'o'; @@ -46,12 +44,14 @@ void test_posv_work(Params& params, bool run) int timer_level = params.timer_level(); slate::Origin origin = params.origin(); slate::Target target = params.target(); - slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); slate::Method methodTrsm = params.method_trsm(); slate::Method methodHemm = params.method_hemm(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // Currently only posv* supports timer_level >= 2. 
std::vector timer_lvl_support{ "posv", "posv_mixed", "posv_mixed_gmres" }; @@ -124,6 +124,11 @@ void test_posv_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -134,16 +139,6 @@ void test_posv_work(Params& params, bool run) {slate::Option::UseFallbackSolver, fallback}, }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Row applies only to target devices"; - return; - } - if ((params.routine == "posv_mixed" || params.routine == "posv_mixed_gmres") && ! std::is_same::value) { params.msg() = "skipping: unsupported mixed precision; must be type=d or z"; @@ -158,95 +153,31 @@ void test_posv_work(Params& params, bool run) int64_t info = 0; - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(nrhs, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // ScaLAPACK data if needed. - std::vector A_data, B_data; - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::HermitianMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); - - slate::HermitianMatrix A; - slate::Matrix B, X; - std::vector X_data; - if (origin != slate::Origin::ScaLAPACK) { - if (dev_dist == slate::Dist::Row && target == slate::Target::Devices) { - // slate_assert(target == slate::Target::Devices); - // todo: doesn't work when lookahead is greater than 2 - // slate_assert(lookahead < 3); - - auto tileNb = slate::func::uniform_blocksize( n, nb ); - auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col, - p, q ); - int num_devices = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, - p, num_devices ); - - A = slate::HermitianMatrix( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - B = slate::Matrix( - n, nrhs, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - A = slate::HermitianMatrix( - uplo, n, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix( - n, nrhs, nb, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. 
- slate::Target origin_target = origin2target(origin); - A.insertLocalTiles(origin_target); - - B.insertLocalTiles(origin_target); - - if (is_iterative) { - X_data.resize(lldB*nlocB); - X = slate::Matrix(n, nrhs, nb, p, q, MPI_COMM_WORLD); - X.insertLocalTiles(origin_target); - } - } - else { - // Create SLATE matrix from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - n, nrhs, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - if (is_iterative) { - X_data.resize(lldB*nlocB); - X = slate::Matrix::fromScaLAPACK( - n, nrhs, &X_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto A_alloc = allocate_test_HermitianMatrix( check || ref, true, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, true, n, nrhs, params ); + TestMatrix> X_alloc; + if (is_iterative) { + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); } + auto& A = A_alloc.A; + auto& A_data = A_alloc.A_data; + auto& Aref = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& B = B_alloc.A; + auto& B_data = B_alloc.A_data; + auto& Bref = B_alloc.Aref; + auto& Bref_data = B_alloc.Aref_data; + auto& X = X_alloc.A; + slate::generate_matrix(params.matrix, A); slate::generate_matrix(params.matrixB, B); print_matrix("A", A, params); print_matrix("B", B, params); // if check is required, copy test data and create a descriptor for it - std::vector Aref_data(lldA*nlocA); - std::vector Bref_data(lldB*nlocB); std::vector B_orig; - slate::HermitianMatrix Aref; - slate::Matrix Bref; if (check || ref) { - // SLATE matrix wrappers for the reference data - Aref = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Bref = slate::Matrix::fromScaLAPACK( - n, nrhs, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - slate::copy( A, Aref ); slate::copy( B, Bref ); @@ -413,35 +344,12 @@ void test_posv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], Aref_desc[9]; - blas_int B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Aref_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, Aref_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); if (check) { // restore Bref_data @@ -477,9 +385,13 @@ void test_posv_work(Params& params, bool run) if (verbose > 2) { if (origin == slate::Origin::ScaLAPACK) { - slate::Debug::diffLapackMatrices(n, n, &A_data[0], 
lldA, &Aref_data[0], lldA, nb, nb); + slate::Debug::diffLapackMatrices( + n, n, &A_data[0], A_alloc.lld, + &Aref_data[0], A_alloc.lld, nb, nb); if (params.routine != "potrf") { - slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], lldB, &Bref_data[0], lldB, nb, nb); + slate::Debug::diffLapackMatrices( + n, nrhs, &B_data[0], B_alloc.lld, + &Bref_data[0], B_alloc.lld, nb, nb); } } } @@ -487,7 +399,7 @@ void test_posv_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_potri.cc b/test/test_potri.cc index 647181acd..07d926ffc 100644 --- a/test/test_potri.cc +++ b/test/test_potri.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" diff --git a/test/test_scale.cc b/test/test_scale.cc index a98ffcb2d..c04652e39 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -7,10 +7,9 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + #include "matrix_utils.hh" #include "test_utils.hh" @@ -49,8 +48,6 @@ void test_scale_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; @@ -67,6 +64,11 @@ void test_scale_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; @@ -124,7 +126,7 @@ void test_scale_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); @@ -164,7 +166,7 @@ void test_scale_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 7c1ed4ffd..18e0819fd 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -7,10 +7,9 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + #include "matrix_utils.hh" #include "test_utils.hh" @@ -57,6 +56,7 @@ void test_scale_row_col_work( Params& params, bool run ) bool trace = params.trace() == 'y'; int verbose = params.verbose(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); mark_params_for_test_Matrix( params ); @@ -68,6 +68,11 @@ void test_scale_row_col_work( Params& params, bool run ) if (! 
run) return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
+
     slate::Options const opts =  {
         {slate::Option::Target, target}
     };
@@ -138,7 +143,7 @@ void test_scale_row_col_work( Params& params, bool run )
 
     // initialize BLACS and ScaLAPACK
     blas_int ictxt, A_desc[9];
-    create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
+    A_alloc.create_ScaLAPACK_context( &ictxt );
     A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
 
     real_t A_max = slate::norm( slate::Norm::Max, A );
@@ -146,7 +151,7 @@
 
     std::vector<real_t> Rlocal( A_alloc.mloc ), Clocal( A_alloc.nloc );
     int myrow, mycol;
-    gridinfo( A.mpiRank(), slate::GridOrder::Col, p, q, &myrow, &mycol );
+    gridinfo( A.mpiRank(), grid_order, p, q, &myrow, &mycol );
 
     // Copy local part of R.
     int64_t ii = 0, iilocal = 0;
@@ -231,7 +236,7 @@
         //Cblacs_exit(1) does not handle re-entering
     #else  // not SLATE_HAVE_SCALAPACK
         SLATE_UNUSED( verbose );
-        if (mpi_rank == 0)
+        if (A.mpiRank() == 0)
             printf( "ScaLAPACK not available\n" );
     #endif
 }
diff --git a/test/test_set.cc b/test/test_set.cc
index 2dc088f0a..85eed4200 100644
--- a/test/test_set.cc
+++ b/test/test_set.cc
@@ -7,10 +7,9 @@
 #include "test.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
 #include "matrix_utils.hh"
 #include "test_utils.hh"
 
@@ -49,8 +48,6 @@ void test_set_work(Params& params, bool run)
     else {
         n = params.dim.n();
     }
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     bool ref_only = params.ref() == 'o';
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
@@ -67,6 +64,11 @@ void test_set_work(Params& params, bool run)
     if (!
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; @@ -124,7 +126,7 @@ void test_set_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); @@ -160,7 +162,7 @@ void test_set_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_stedc.cc b/test/test_stedc.cc index b28271c10..73ff18ec9 100644 --- a/test/test_stedc.cc +++ b/test/test_stedc.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" #include "matrix_generator.hh" diff --git a/test/test_stedc_deflate.cc b/test/test_stedc_deflate.cc index 9ad2dedd8..077710af2 100644 --- a/test/test_stedc_deflate.cc +++ b/test/test_stedc_deflate.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_secular.cc b/test/test_stedc_secular.cc index 2e5aeabaf..e321bb8cd 100644 --- a/test/test_stedc_secular.cc +++ b/test/test_stedc_secular.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_sort.cc b/test/test_stedc_sort.cc index 96aa1d451..26eff3fd7 100644 --- a/test/test_stedc_sort.cc +++ b/test/test_stedc_sort.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_z_vector.cc b/test/test_stedc_z_vector.cc index bc8a5013f..64ca668a2 100644 --- a/test/test_stedc_z_vector.cc +++ b/test/test_stedc_z_vector.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_steqr2.cc b/test/test_steqr2.cc index 4d092669e..ff5ec29b9 100644 --- a/test/test_steqr2.cc +++ b/test/test_steqr2.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_sterf.cc b/test/test_sterf.cc index 1d0a7f2a5..5488ef7fe 100644 --- a/test/test_sterf.cc +++ b/test/test_sterf.cc @@ -7,7 +7,7 @@ #include "blas.hh" #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" + #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_svd.cc b/test/test_svd.cc index 2a1fb276b..36c4e9ba3 100644 --- a/test/test_svd.cc +++ b/test/test_svd.cc @@ -8,10 +8,11 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include @@ -35,9 +36,6 @@ void test_svd_work( Params& params, bool run ) lapack::Job jobvt = params.jobvt(); int64_t m = params.dim.m(); int64_t n = 
params.dim.n(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); - int64_t nb = params.nb(); int64_t ib = params.ib(); int64_t panel_threads = params.panel_threads(); int64_t lookahead = params.lookahead(); @@ -51,6 +49,10 @@ void test_svd_work( Params& params, bool run ) slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // nonuniform nb is not always supported in the reduction to band + params.nonuniform_nb.used( false ); + params.time(); params.ref_time(); params.error2(); @@ -86,6 +88,11 @@ void test_svd_work( Params& params, bool run ) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + if (check && ! ref && (jobu == slate::Job::NoVec || jobvt == slate::Job::NoVec)) { params.msg() = "job = NoVec requires --ref y to check singular values"; @@ -98,48 +105,6 @@ void test_svd_work( Params& params, bool run ) {slate::Option::InnerBlocking, ib} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - int64_t min_mn = std::min(m, n); - - // Figure out local size. - // matrix A (local input), m-by-n - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - std::vector Acpy_data; - - // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; - // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. - int64_t Um = m; - int64_t Un = jobu == slate::Job::AllVec ? m : min_mn; - int64_t VTm = jobvt == slate::Job::AllVec ? n : min_mn; - int64_t VTn = n; - - // matrix U (local output), U(m, min_mn), singular values of A - int64_t mlocU = num_local_rows_cols( Um, nb, myrow, p ); - int64_t nlocU = num_local_rows_cols( Un, nb, mycol, q ); - int64_t lldU = blas::max(1, mlocU); // local leading dimension of U - std::vector U_data(1); - - // matrix VT (local output), VT(min_mn, n) - int64_t mlocVT = num_local_rows_cols( VTm, nb, myrow, p ); - int64_t nlocVT = num_local_rows_cols( VTn, nb, mycol, q ); - int64_t lldVT = blas::max(1, mlocVT); // local leading dimension of VT - std::vector VT_data(1); - - // array Sigma (global output), singular values of A - std::vector Sigma(min_mn); - - slate::Matrix A; // (m, n); - slate::Matrix U; // (m, min_mn); - slate::Matrix VT; // (min_mn, n); - slate::Matrix Acpy; - bool wantu = (jobu == slate::Job::Vec || jobu == slate::Job::AllVec || jobu == slate::Job::SomeVec); @@ -147,45 +112,28 @@ void test_svd_work( Params& params, bool run ) || jobvt == slate::Job::AllVec || jobvt == slate::Job::SomeVec); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + int64_t min_mn = std::min(m, n); - Acpy = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - Acpy.insertLocalTiles(origin_target); + // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; + // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. + int64_t Um = wantu ? m : 0; + int64_t Un = wantu ? (jobu == slate::Job::AllVec ? m : min_mn) : 0; + int64_t VTm = wantvt ? (jobvt == slate::Job::AllVec ? n : min_mn) : 0; + int64_t VTn = wantvt ? 
n : 0; - if (wantu) { - U = slate::Matrix( Um, Un, nb, p, q, MPI_COMM_WORLD ); - U.insertLocalTiles(origin_target); - } - if (wantvt) { - VT = slate::Matrix( VTm, VTn, nb, p, q, MPI_COMM_WORLD ); - VT.insertLocalTiles(origin_target); - } - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + // array Sigma (global output), singular values of A + std::vector Sigma(min_mn); - Acpy_data.resize( lldA * nlocA ); - Acpy = slate::Matrix::fromScaLAPACK( - m, n, &Acpy_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + auto A_alloc = allocate_test_Matrix( check || ref, true, m, n, params ); + auto U_alloc = allocate_test_Matrix( false, true, Um, Un, params ); + auto VT_alloc = allocate_test_Matrix( false, true, VTm, VTn, params ); + // TODO Acpy isn't always needed + auto Acpy_alloc = allocate_test_Matrix( false, true, m, n, params ); - if (wantu) { - U_data.resize(lldU*nlocU); - U = slate::Matrix::fromScaLAPACK( - Um, Un, &U_data[0], lldU, nb, p, q, MPI_COMM_WORLD ); - } - if (wantvt) { - VT_data.resize(lldVT*nlocVT); - VT = slate::Matrix::fromScaLAPACK( - VTm, VTn, &VT_data[0], lldVT, nb, p, q, MPI_COMM_WORLD ); - } - } + auto& A = A_alloc.A; + auto& U = U_alloc.A; + auto& VT = VT_alloc.A; + auto& Acpy = Acpy_alloc.A; if (verbose >= 1) { printf( "%% A %6lld-by-%6lld\n", llong( A.m() ), llong( A.n() ) ); @@ -201,15 +149,10 @@ void test_svd_work( Params& params, bool run ) slate::generate_matrix( params.matrix, A); print_matrix( "A", A, params ); - slate::Matrix Aref; std::vector Sigma_ref; - std::vector Aref_data; if (check || ref) { Sigma_ref.resize( min_mn ); - Aref_data.resize( lldA * nlocA ); - Aref = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - slate::copy( A, Aref ); + slate::copy( A, A_alloc.Aref ); slate::copy( A, Acpy ); } @@ -272,8 +215,8 @@ void test_svd_work( Params& params, bool run ) Rm = blas::max( Rm, m ); if (jobvt == slate::Job::AllVec) Rm = blas::max( Rm, n ); - slate::Matrix R( Rm, Rm, nb, p, q, MPI_COMM_WORLD ); - R.insertLocalTiles(); + auto R_alloc = allocate_test_Matrix( false, true, Rm, Rm, params ); + auto R = R_alloc.A; if (wantu) { //================================================== @@ -344,42 +287,21 @@ void test_svd_work( Params& params, bool run ) #ifdef SLATE_HAVE_SCALAPACK // Run reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - blas_int A_desc[9]; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - blas_int U_desc[9]; - scalapack_descinit(U_desc, m, min_mn, nb, nb, 0, 0, ictxt, mlocU, &info); - slate_assert(info == 0); - - blas_int VT_desc[9]; - scalapack_descinit(VT_desc, min_mn, n, nb, nb, 0, 0, ictxt, mlocVT, &info); - slate_assert(info == 0); - - // Allocate U and VT if not already allocated. - // If origin=scalapack, just overwrite SLATE's U and VT. 
- if (wantu) { - U_data.resize( lldU * nlocU ); - } - if (wantvt) { - VT_data.resize( lldVT * nlocVT ); + blas_int ictxt, A_desc[9], U_desc[9], VT_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + U_alloc.ScaLAPACK_descriptor( ictxt, U_desc ); + VT_alloc.ScaLAPACK_descriptor( ictxt, VT_desc ); + + auto& Aref_data = A_alloc.Aref_data; + auto& U_data = U_alloc.A_data; + auto& VT_data = VT_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK) { + U_data.resize( U_alloc.lld * U_alloc.nloc ); + VT_data.resize( VT_alloc.lld * VT_alloc.nloc ); } // ScaLAPACK uses job = N and V (same as S); @@ -444,7 +366,7 @@ void test_svd_work( Params& params, bool run ) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_symm.cc b/test/test_symm.cc index c80dd3d98..8d6ebed19 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -6,11 +6,13 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +37,7 @@ void test_symm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -50,6 +49,9 @@ void test_symm_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -63,6 +65,11 @@ void test_symm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,102 +80,48 @@ void test_symm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
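// -----------------------------------------------------------------------------
// Note: the deletions in these tests all funnel into the TestMatrix helper
// added in test/matrix_utils.cc. A minimal sketch of its shape, inferred only
// from the members this patch uses (the real definition is in
// test/matrix_utils.hh):
//
//     template <typename MatrixType>
//     class TestMatrix {
//     public:
//         using scalar_t = typename MatrixType::value_type;
//
//         MatrixType A, Aref;               // matrix under test + ref copy
//         std::vector<scalar_t> A_data;     // ScaLAPACK-layout storage for A
//         std::vector<scalar_t> Aref_data;  // ScaLAPACK-layout storage for Aref
//         int64_t mloc, nloc, lld;          // local rows, cols, leading dim
//
//         void create_ScaLAPACK_context( blas_int* ictxt );
//         void ScaLAPACK_descriptor( blas_int ictxt, blas_int desc[9] );
//     };
//
// allocate_test_Matrix( ref_copy, ..., m, n, params ) appears to build one
// from params (nb, grid, origin); the first flag requests the Aref copy. The
// second flag's role is not visible in this hunk.
// -----------------------------------------------------------------------------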
- std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_SymmetricMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - slate::SymmetricMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::SymmetricMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data and create a descriptor for it. - slate::Matrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - Cm, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif - - if (side == slate::Side::Left) - slate_assert(A.mt() == C.mt()); - else - slate_assert(A.mt() == C.nt()); - slate_assert(B.mt() == C.mt()); - slate_assert(B.nt() == C.nt()); + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); - if (trace) slate::trace::Trace::on(); - else slate::trace::Trace::off(); + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). 
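// -----------------------------------------------------------------------------
// Note: this check verifies symm without any ScaLAPACK run. With a random X
// fixed before the timed run, Y = alpha A (B X) + beta (C X) is accumulated
// from the *original* operands; after symm overwrites C = alpha A B + beta C,
// the product C X must reproduce Y. A minimal sketch of the first half for
// side = Left, with Z the An-by-nrhs scratch allocated above and one/zero the
// usual scalar constants (names assumed in scope):
//
//     slate::gemm( one, C, X, zero, Y, opts );      // Y = C X
//     slate::gemm( one, B, X, zero, Z, opts );      // Z = B X
//     slate::symm( slate::Side::Left,
//                  alpha, A, Z, beta, Y, opts );    // Y = alpha A Z + beta Y
//
// The second half (after the run) then measures || C X - Y ||.
// -----------------------------------------------------------------------------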
@@ -192,6 +145,16 @@ void test_symm_work(Params& params, bool run) throw slate::Exception("unknown side"); } + if (side == slate::Side::Left) + slate_assert(A.mt() == C.mt()); + else + slate_assert(A.mt() == C.nt()); + slate_assert(B.mt() == C.mt()); + slate_assert(B.nt() == C.nt()); + + if (trace) slate::trace::Trace::on(); + else slate::trace::Trace::off(); + double time = barrier_get_wtime(MPI_COMM_WORLD); //================================================== @@ -218,6 +181,9 @@ void test_symm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -236,56 +202,31 @@ void test_symm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - size_t ldw = nb*ceildiv( ceildiv( mlocA, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocA + mlocA + ldw); - std::vector worklange(std::max({mlocC, nlocC, mlocB, nlocB})); - - // get norms of the original data - real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An, - &A_data[0], 1, 1, A_desc, &worklansy[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -297,12 +238,13 @@ void test_symm_work(Params& params, bool run) MPI_Barrier(MPI_COMM_WORLD); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); + + print_matrix( "Diff", C, params ); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm @@ -317,7 +259,7 @@ void test_symm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_synorm.cc b/test/test_synorm.cc index 0976e59c5..eb3dfc025 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -36,12 +37,17 @@ void test_synorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_SymmetricMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -49,44 +55,26 @@ void test_synorm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); + auto A_alloc = allocate_test_SymmetricMatrix( ref_copy, false, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::SymmetricMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::SymmetricMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::SymmetricMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. 
- A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix(params.matrix, A); - print_matrix("A", A, params); if (trace) slate::trace::Trace::on(); @@ -107,40 +95,28 @@ void test_synorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); - } - - if (check || ref) { + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); + } + // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklansy(lwork); + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; + std::vector worklansy( lwork ); //================================================== // Run ScaLAPACK reference routine. @@ -151,10 +127,6 @@ void test_synorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklansy[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lansy( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -164,7 +136,7 @@ void test_synorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -182,140 +154,131 @@ void test_synorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklansy(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. 
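// -----------------------------------------------------------------------------
// Note on the grid_order guard added below: the extended test pokes a peak
// value through both the SLATE tile and the mirrored ScaLAPACK array, and the
// local-index arithmetic it uses,
//
//     int64_t ilocal = int(i / p)*nb + ii;          // local row in A_data
//     int64_t jlocal = int(j / q)*nb + jj;          // local col in A_data
//     A_data[ ilocal + jlocal*A_alloc.lld ] = peak;
//
// presumably only matches the BLACS descriptor when tile (i, j) lives on
// process (i % p, j % q), i.e. for a column-major process grid; hence the
// warning and early exit for row-major grids.
// -----------------------------------------------------------------------------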
- int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); - } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) - continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); + } + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) continue; - } - - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } - - A_norm = slate::norm(norm, A, opts); - - real_t A_norm_ref = scalapack_plansy( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklansy[0]); - - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } - - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! 
okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real( peak ), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) + continue; + + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } + + A_norm = slate::norm(norm, A, opts); + + A_norm_ref = scalapack_plansy( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklansy[0]); + + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real( peak ), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } - - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. 
- A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index 2ae006ce1..cf051cabd 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -8,10 +8,11 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +39,7 @@ void test_syr2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +51,9 @@ void test_syr2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +67,11 @@ void test_syr2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,77 +85,30 @@ void test_syr2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
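// -----------------------------------------------------------------------------
// Note: the mloc/nloc/lld bookkeeping deleted here (and in the other tests)
// is ScaLAPACK's numroc computation, now hidden inside the allocation helper.
// Roughly, for an m-long dimension split into nb-blocks over p process rows,
// the count owned by process row myrow is (a sketch, assuming the
// distribution starts at process row 0):
//
//     int64_t nblocks = m / nb;             // full blocks in the dimension
//     int64_t mloc    = (nblocks / p)*nb;   // full blocks every row owns
//     int64_t extra   = nblocks % p;
//     if (myrow < extra)
//         mloc += nb;                       // one extra full block
//     else if (myrow == extra)
//         mloc += m % nb;                   // the trailing partial block
//
// with lld = max( 1, mloc ).
// -----------------------------------------------------------------------------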
- std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - slate::Matrix A, B; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector< scalar_t > Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -175,17 +134,19 @@ void test_syr2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^T X)) + alpha B (A^T X)) + (beta C X). // Y = beta C X @@ -224,6 +185,9 @@ void test_syr2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
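// -----------------------------------------------------------------------------
// Note: the norms feeding the ScaLAPACK comparison are now taken up front,
// while A, B, and C still hold the inputs (syr2k overwrites C in place):
//
//     real_t A_norm = 0, B_norm = 0, C_orig_norm = 0;
//     if (ref) {
//         slate::copy( C, Cref );                   // keep the original C
//         A_norm      = slate::norm( norm, A );
//         B_norm      = slate::norm( norm, B );
//         C_orig_norm = slate::norm( norm, Cref );
//     }
//
// This replaces the scalapack_plange/plansy calls (and their workspace) that
// previously recomputed these from the ScaLAPACK mirrors after the fact.
// -----------------------------------------------------------------------------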
real_t y_norm = slate::norm( norm, Y, opts ); @@ -242,54 +206,31 @@ void test_syr2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -302,11 +243,11 @@ void test_syr2k_work(Params& params, bool run) print_matrix("Cref", Cref, params); - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm @@ -323,7 +264,7 @@ void test_syr2k_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_syrk.cc b/test/test_syrk.cc index 03d6de765..bae13cf10 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +36,7 @@ void test_syrk_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -49,6 +47,9 @@ void test_syrk_work(Params& params, bool run) params.matrix.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -62,6 +63,11 @@ void test_syrk_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,64 +79,26 @@ void test_syrk_work(Params& params, bool run) // setup so op(A) is n-by-k int64_t Am = (transA == slate::Op::NoTrans ? n : k); int64_t An = (transA == slate::Op::NoTrans ? k : n); - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
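// -----------------------------------------------------------------------------
// Note: allocation is now lazy. When origin == ScaLAPACK the helper
// presumably already holds the ScaLAPACK-layout buffers; otherwise they are
// resized only inside the reference branch, right before copying the SLATE
// result back, e.g. (shape as used below in this test):
//
//     if (origin != slate::Origin::ScaLAPACK) {
//         A_data.resize( A_alloc.lld * A_alloc.nloc );
//         C_data.resize( C_alloc.lld * C_alloc.nloc );
//         copy( A, &A_data[0], A_desc );   // SLATE tiles -> ScaLAPACK layout
//         copy( C, &C_data[0], C_desc );
//     }
//
// so runs that skip --ref never pay for the ScaLAPACK mirror.
// -----------------------------------------------------------------------------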
- std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - slate::Matrix A; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -145,17 +113,19 @@ void test_syrk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^T X) + (beta C X). // Y = beta C X @@ -187,6 +157,9 @@ void test_syrk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
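// -----------------------------------------------------------------------------
// Note: syrk analogue of the symm/syr2k checks. The first half accumulated
// Y = alpha A (A^T X) + beta (C X) from the original operands, so after syrk
// overwrites C = alpha A A^T + beta C, the product C X must reproduce Y; the
// error is scaled by the usual rank-k bound, as in the code further down:
//
//     real_t error = C_diff_norm
//                  / (sqrt( real_t(k) + 2 ) * std::abs( alpha ) * A_norm * A_norm
//                     + 2 * std::abs( beta ) * C_orig_norm);
// -----------------------------------------------------------------------------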
real_t y_norm = slate::norm( norm, Y, opts ); @@ -205,49 +178,27 @@ void test_syrk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -257,11 +208,11 @@ void test_syrk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime( MPI_COMM_WORLD ) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -277,7 +228,7 @@ void test_syrk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_tb2bd.cc b/test/test_tb2bd.cc index a1ab1aafa..29204bda2 100644 --- a/test/test_tb2bd.cc +++ b/test/test_tb2bd.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_tbsm.cc b/test/test_tbsm.cc index 2442245b4..b7204bb90 100644 --- a/test/test_tbsm.cc +++ b/test/test_tbsm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "print_matrix.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_trcondest.cc b/test/test_trcondest.cc index 74fbac410..1301854fb 100644 --- a/test/test_trcondest.cc +++ b/test/test_trcondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_trmm.cc b/test/test_trmm.cc index 26e0b3b64..a8b6d08e6 100644 --- a/test/test_trmm.cc +++ b/test/test_trmm.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,9 +39,6 @@ void test_trmm_work(Params& params, bool run) int64_t nrhs = params.nrhs(); scalar_t alpha = params.alpha.get(); slate::Op transB = params.transB(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -51,6 +49,9 @@ void test_trmm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -65,6 +66,11 @@ void test_trmm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,60 +85,24 @@ void test_trmm_work(Params& params, bool run) int64_t Bm = (transB == slate::Op::NoTrans ? m : n); int64_t Bn = (transB == slate::Op::NoTrans ? n : m); - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. 
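// -----------------------------------------------------------------------------
// Note: with the hand-rolled MPI/grid setup deleted, rank and grid queries
// move onto the objects themselves, a pattern repeated across this patch:
//
//     if (A.mpiRank() == 0)                    // was: mpi_rank == 0
//         printf( "ScaLAPACK not available\n" );
//
//     int myrow, mycol;
//     gridinfo( A.mpiRank(), grid_order, p, q, &myrow, &mycol );
//
// so the tests no longer need their own MPI_Comm_rank / Cblacs_pinfo
// boilerplate.
// -----------------------------------------------------------------------------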
- int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed. - std::vector<scalar_t> A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - } + auto A_alloc = allocate_test_TriangularMatrix<scalar_t>( false, true, An, params ); + auto B_alloc = allocate_test_Matrix<scalar_t>( ref, true, Bm, Bn, params ); - slate::TriangularMatrix<scalar_t> A; - slate::Matrix<scalar_t> B; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::TriangularMatrix<scalar_t>(uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix<scalar_t>(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::TriangularMatrix<scalar_t>::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; generate_matrix( params.matrix, A ); generate_matrix( params.matrixB, B ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - std::vector<scalar_t> Bref_data; - if (ref) { - Bref_data.resize( lldB * nlocB ); - auto Bref = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - slate::copy( B, Bref ); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_orig_norm=0; + if (ref) { + slate::copy( B, Bref ); + + A_norm = slate::norm(norm, A); + B_orig_norm = slate::norm(norm, B); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -148,14 +118,16 @@ void test_trmm_work(Params& params, bool run) B = conj_transpose( B ); // If check run, perform first half of SLATE residual check. - slate::Matrix<scalar_t> X, X2, Y; + TestMatrix<slate::Matrix<scalar_t>> X_alloc, X2_alloc, Y_alloc; if (check && ! ref) { - X = slate::Matrix<scalar_t>( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - X2 = slate::Matrix<scalar_t>( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X2.insertLocalTiles(origin_target); - Y = slate::Matrix<scalar_t>( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix<scalar_t>( false, true, n, nrhs, params ); + X2_alloc = allocate_test_Matrix<scalar_t>( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix<scalar_t>( false, true, m, nrhs, params ); + + auto& X = X_alloc.A; + auto& X2 = X2_alloc.A; + auto& Y = Y_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); generate_matrix( mp, X ); @@ -207,6 +179,9 @@ void test_trmm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, B*X - Y.
real_t y_norm = slate::norm( norm, Y, opts ); @@ -224,48 +199,27 @@ void test_trmm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - if (origin != slate::Origin::ScaLAPACK) { - // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - } + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); - // allocate workspace for norms - std::vector<real_t> worklantr(std::max(mlocA, nlocA)); - std::vector<real_t> worklange(std::max(mlocB, nlocB)); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; - // get norms of the original data - real_t A_norm = scalapack_plantr( - norm2str(norm), uplo2str(uplo), diag2str(diag), Am, An, &A_data[0], - 1, 1, A_desc, &worklantr[0]); - real_t B_orig_norm = scalapack_plange( - norm2str(norm), Bm, Bn, &Bref_data[0], 1, 1, B_desc, &worklange[0]); + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + } //================================================== // Run ScaLAPACK reference routine.
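The *_alloc objects threaded through these hunks come from the new test/matrix_utils helpers added by this patch; their definitions are not shown in these hunks. The sketch below reconstructs the interface from its uses in this diff — the member and method names appear verbatim above, while the exact types and layout are assumptions:

    // Assumed shape of the TestMatrix helper (see test/matrix_utils.hh/.cc).
    template <typename MatrixType>
    struct TestMatrix {
        // Assumes SLATE matrices expose value_type, as slate::BaseMatrix does.
        using scalar_t = typename MatrixType::value_type;

        MatrixType A;     // matrix under test (SLATE- or ScaLAPACK-origin)
        MatrixType Aref;  // reference copy for check/ref runs
        std::vector<scalar_t> A_data, Aref_data;  // 2D block-cyclic buffers backing A, Aref
        int64_t mloc, nloc, lld;  // local rows, local cols, local leading dimension

        // Replaces the deleted Cblacs_pinfo/Cblacs_get/Cblacs_gridinit and
        // scalapack_descinit boilerplate with one call each.
        void create_ScaLAPACK_context( blas_int* ictxt );
        void ScaLAPACK_descriptor( blas_int ictxt, blas_int desc[9] );
    };

Centralizing the grid sanity checks and descinit error handling in these two methods is what lets each reference block above shrink to a handful of calls.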
@@ -277,12 +231,11 @@ void test_trmm_work(Params& params, bool run) &Bref_data[0], 1, 1, Bref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Bref_data - B_data - blas::axpy(Bref_data.size(), -1.0, &B_data[0], 1, &Bref_data[0], 1); + // get differences B = B - Bref + slate::add(-one, Bref, one, B); - // norm(Bref_data - B_data) - real_t B_diff_norm = scalapack_plange(norm2str(norm), Bm, Bn, &Bref_data[0], - 1, 1, Bref_desc, &worklange[0]); + // norm(B - Bref) + real_t B_diff_norm = slate::norm(norm, B); real_t error = B_diff_norm / (sqrt(real_t(Am) + 2) * std::abs(alpha) * A_norm * B_orig_norm); @@ -298,7 +251,7 @@ void test_trmm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 8262e6ac9..bd8d87f67 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -38,12 +39,17 @@ void test_trnorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_TrapezoidMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -51,55 +57,26 @@ void test_trnorm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // upper requires m <= n, - // lower requires m >= n - if ((uplo == slate::Uplo::Lower && m < n) || - (uplo == slate::Uplo::Upper && m > n)) { - char buf[255]; - snprintf( buf, sizeof( buf ), "skipping invalid size: %s, %lld-by-%lld", - uplo2str( uplo ), llong( m ), llong( n ) ); - params.msg() = buf; - return; - } + auto A_alloc = allocate_test_TrapezoidMatrix<scalar_t>( ref_copy, false, m, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector<scalar_t> A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } + slate::generate_matrix(params.matrix, A); - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::TrapezoidMatrix<scalar_t> A0(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); - - slate::TrapezoidMatrix<scalar_t> A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles.
- slate::Target origin_target = origin2target(origin); - A = slate::TrapezoidMatrix<scalar_t>(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::TrapezoidMatrix<scalar_t>::fromScaLAPACK( - uplo, diag, m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix( params.matrix, A ); - print_matrix("A", A, params); if (trace) @@ -123,39 +100,27 @@ void test_trnorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - if (info != 0) - printf( "scalapack_descinit info %lld\n", llong( info ) ); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); - } - - if (check || ref) { + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); + } + // allocate work space - std::vector<real_t> worklantr(std::max(mlocA, nlocA)); + std::vector<real_t> worklantr( std::max(A_alloc.mloc, A_alloc.nloc) ); + + // comparison with reference routine from ScaLAPACK //================================================== // Run ScaLAPACK reference routine. @@ -166,10 +131,6 @@ void test_trnorm_work(Params& params, bool run) m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lantr( - // norm, A.uplo(), diag, - // m, n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One) { error /= sqrt(m); } else if (norm == slate::Norm::Inf) { error /= sqrt(n); } else if (norm == slate::Norm::Fro) { error /= sqrt(m*n); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -200,147 +161,141 @@ void test_trnorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - // allocate work space - std::vector<real_t> worklantr(std::max(mlocA, nlocA)); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops.
- int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set<int64_t> i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set<int64_t> j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); - } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) - continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set<int64_t> ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set<int64_t> jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); + } + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set<int64_t> i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set<int64_t> j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : i_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) continue; - } - - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - slate_assert(A_data[ ilocal + jlocal*lldA ] == save); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } - - A_norm = slate::norm(norm, A, opts); - - real_t A_norm_ref = scalapack_plantr( - norm2str(norm), uplo2str(A.uplo()), diag2str(diag), - m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); - - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } - - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits<real_t>::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && !
slate::is_complex<scalar_t>::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan, - // except in Unit case with i == j and ii == jj, - // where peak shouldn't affect A_norm. - bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set<int64_t> ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set<int64_t> jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) + continue; + + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } + + A_norm = slate::norm(norm, A, opts); + + A_norm_ref = scalapack_plantr( + norm2str(norm), uplo2str(A.uplo()), diag2str(diag), + m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); + + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan, + // except in Unit case with i == j and ii == jj, + // where peak shouldn't affect A_norm. + bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself.
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } - - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } } } } } - } - - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - #endif + + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_trsm.cc b/test/test_trsm.cc index 24e3d3360..d1b8affe4 100644 --- a/test/test_trsm.cc +++ b/test/test_trsm.cc @@ -6,12 +6,13 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "print_matrix.hh" -#include "grid_utils.hh" #include #include @@ -44,9 +45,6 @@ void test_trsm_work(Params& params, bool run) int64_t m = params.dim.m(); int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get<scalar_t>(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -58,6 +56,9 @@ void test_trsm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +73,11 @@ void test_trsm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -87,49 +93,12 @@ void test_trsm_work(Params& params, bool run) int64_t Bm = m; int64_t Bn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed.
- std::vector<scalar_t> A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - } + auto A_alloc = allocate_test_TriangularMatrix<scalar_t>( false, true, An, params ); + auto B_alloc = allocate_test_Matrix<scalar_t>( check || ref, true, Bm, Bn, params ); - slate::TriangularMatrix<scalar_t> A; - slate::Matrix<scalar_t> B; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::TriangularMatrix<scalar_t>( - uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix<scalar_t>( - Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::TriangularMatrix<scalar_t>::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - - B_data.resize( lldB * nlocB ); - B = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); @@ -140,13 +109,8 @@ void test_trsm_work(Params& params, bool run) auto AH = slate::HermitianMatrix<scalar_t>( A ); slate::potrf( AH, opts ); - // if check is required, copy test data - std::vector< scalar_t > Bref_data; - slate::Matrix<scalar_t> Bref; + // If check or ref run is required, copy the test data. if (check || ref) { - Bref_data.resize( lldB * nlocB ); - Bref = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); slate::copy( B, Bref ); } @@ -220,34 +184,22 @@ void test_trsm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK for timing only - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); + + auto& A_data = A_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + } //================================================== // Run ScaLAPACK reference routine.
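Taken together, the trmm, trnorm, and trsm hunks apply one adoption recipe per test: mark the helper's parameters, skip unsupported combinations early, allocate through the helper, and take references into it. A condensed sketch of that recipe, with a hypothetical test_xyz_work standing in for any of the tests above; the boolean flag meanings are inferred from the call sites, not documented in this diff:

    template <typename scalar_t>
    void test_xyz_work( Params& params, bool run )
    {
        int64_t m = params.dim.m();
        int64_t n = params.dim.n();

        params.matrix.mark();
        mark_params_for_test_Matrix( params );  // registers nb, grid, dev_order, nonuniform_nb

        if (! run)
            return;

        // Bail out early on combinations the helpers cannot support.
        if (is_invalid_parameters( params ))
            return;

        // First flag appears to request a reference copy (Aref); the second is assumed.
        auto B_alloc = allocate_test_Matrix<scalar_t>( true, true, m, n, params );
        auto& B    = B_alloc.A;
        auto& Bref = B_alloc.Aref;

        slate::generate_matrix( params.matrix, B );
        slate::copy( B, Bref );  // keep the original for the residual check
    }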
diff --git a/test/test_trtri.cc b/test/test_trtri.cc index 2cd7f6daf..e0c0d9435 100644 --- a/test/test_trtri.cc +++ b/test/test_trtri.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "grid_utils.hh" diff --git a/test/test_unmqr.cc b/test/test_unmqr.cc index d7270b303..87eff3ecf 100644 --- a/test/test_unmqr.cc +++ b/test/test_unmqr.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_utils.hh b/test/test_utils.hh index a946411b0..bdae1d494 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -7,8 +7,59 @@ #define SLATE_TEST_UTILS_HH #include "slate/slate.hh" +#include "test.hh" -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ +/// Checks for common invalid parameter combinations +/// +/// @return true if the configuration should be skipped +/// +inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = false) +{ + slate::Origin origin = params.origin(); + slate::Target target = params.target(); + slate::GridOrder dev_order = params.dev_order(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + + if (target != slate::Target::Devices && dev_order == slate::GridOrder::Col) { + params.msg() = "skipping: dev_order = Col applies only to target devices"; + return true; + } + + if (dev_order == slate::GridOrder::Col && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: dev_order = Col not supported with origin=ScaLAPACK"; + return true; + } + + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with origin=ScaLAPACK"; + return true; + } + + #ifdef SLATE_HAVE_SCALAPACK + if (!keep_nonuniform_ref && nonuniform_nb && params.ref() != 'n') { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + if (params.ref() == 'o') { + // If ref=='o', the user doesn't want to run SLATE version + return true; + } + else { + params.ref() = 'n'; + } + } + #else + // Can only run ref when we have ScaLAPACK + if (params.ref()) { + params.msg() = "skipping reference: ScaLAPACK not available"; + params.ref() = false; + } + #endif + + + return false; +} + +//------------------------------------------------------------------------------ /// Applies the operator thunk to each element of A and B to update B. /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the @@ -29,19 +80,25 @@ void matrix_iterator( int64_t B_mt = B.mt(); int64_t B_nt = B.nt(); - if constexpr (std::is_same<matrix_type, slate::Matrix<scalar_t>>::value) { - int64_t A_j = 0, A_jj = 0; - for (int64_t B_j = 0; B_j < B_nt; ++B_j) { + constexpr bool is_general = std::is_same_v<matrix_type, slate::Matrix<scalar_t>>; + assert( is_general == (A.uplo() == slate::Uplo::General) ); + bool is_upper = (A.uplo() == slate::Uplo::Upper); + + int64_t A_j = 0, A_jj = 0; + for (int64_t B_j = 0; B_j < B_nt; ++B_j) { - int64_t A_i = 0, A_ii = 0; - for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + int64_t A_i = 0, A_ii = 0; + for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + + if (is_general || (is_upper ?
(B_i <= B_j) : (B_i >= B_j))) { #pragma omp task shared(A, B) \ firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) { + int tag = A_i + A_j * A.mt(); if (B.tileIsLocal( B_i, B_j )) { A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), - slate::Layout::ColMajor ); + slate::Layout::ColMajor, tag ); A.tileGetForReading( A_i, A_j, ColMajor ); B.tileGetForWriting( B_i, B_j, ColMajor ); @@ -56,40 +113,44 @@ void matrix_iterator( scalar_t const* TA_data = TA.data(); scalar_t* TB_data = TB.data(); for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { + int64_t ii_start = 0, ii_end = mb; + if (!is_general && B_i == B_j) { // diagonal tile + if (is_upper) + ii_end = std::min(jj+1, mb); + else + ii_start = jj; + } + for (int64_t ii = ii_start; ii < ii_end; ++ii) { thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], TB_data[ ii + jj*ldb ] ); } } } else if (A.tileIsLocal( A_i, A_j )) { - A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ) ); + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); } } - A_ii += B.tileMb( B_i ); - assert( A_ii <= A.tileMb( A_i ) ); - if (A_ii == A.tileMb( A_i )) { - ++A_i; - A_ii = 0; - } } - A_jj += B.tileNb( B_j ); - assert( A_jj <= A.tileNb( A_j ) ); - if (A_jj == A.tileNb( A_j )) { - ++A_j; - A_jj = 0; + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; } } - } - else { - assert( false ); + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; + } } A.releaseRemoteWorkspace(); } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// subtract_matrices takes input matrices A and B, and performs B = B - A. /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the @@ -103,7 +164,7 @@ void subtract_matrices( matrix_type& A, matrix_type& B ) matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } ); } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// copy_matrix copies A to B /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the