From 3d6965eb88dfe53b2e7cb82196e023136db755b1 Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Fri, 15 Dec 2023 10:01:53 -0500
Subject: [PATCH 01/33] Add nonuniform nb support to test_posv

---
 test/matrix_utils.hh | 121 +++++++++++++++++++++++++++++++++++++++
 test/test.hh         |   6 +-
 test/test_posv.cc    | 132 ++++++++----------------------------------
 3 files changed, 148 insertions(+), 111 deletions(-)

diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 7d7f44a96..750afddba 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -349,4 +349,125 @@ TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
     return matrix;
 }
 
+// -----------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_HermitianMatrix
+inline void mark_params_for_test_HermitianMatrix(Params& params)
+{
+    params.grid.m();
+    params.grid.n();
+    params.dev_dist();
+    params.uplo();
+    params.nb();
+    params.nonuniform_nb();
+    params.origin();
+    params.grid_order();
+}
+
+// -----------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and a reference version for testing.
+///
+/// @param ref_matrix[in]
+///     Whether to allocate a reference matrix
+///
+/// @param nonuniform_ref[in]
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param n[in]
+///     The matrix dimension; the matrix is n-by-n
+///
+/// @param params[in]
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    int64_t p = params.grid.m();
+    int64_t q = params.grid.n();
+    slate::Dist dev_dist = params.dev_dist();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+
+    // The object to be returned
+    TestMatrix<slate::HermitianMatrix<scalar_t>> matrix;
+    matrix.m = n;
+    matrix.n = n;
+
+    // ScaLAPACK variables for reference matrix
+    int mpi_rank, myrow, mycol;
+    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
+    gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
+    matrix.nb = nb;
+    matrix.mloc = num_local_rows_cols( n, nb, myrow, p );
+    matrix.nloc = num_local_rows_cols( n, nb, mycol, q );
+    matrix.lld  = blas::max( 1, matrix.mloc ); // local leading dimension of A
+
+    // Functions for nonuniform tile sizes or row device distribution
+    std::function< int64_t (int64_t j) > tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        tileNb = slate::func::uniform_blocksize( n, nb );
+    }
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
+                                                   p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        if (nonuniform_nb || dev_dist == slate::Dist::Row) {
+            matrix.A = slate::HermitianMatrix<scalar_t>(
+                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+        }
+        else {
+            matrix.A = slate::HermitianMatrix<scalar_t>(
+                    uplo, n, nb, p, q, MPI_COMM_WORLD);
+        }
+
+        // SLATE allocates CPU or GPU tiles.
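+        // (origin2target() is assumed to map Origin::Host and
+        // Origin::Devices to the matching Target, so the insertLocalTiles()
+        // call below places tiles in CPU or GPU memory accordingly.)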
+        slate::Target origin_target = origin2target(origin);
+        matrix.A.insertLocalTiles(origin_target);
+    }
+    else {
+        assert( !nonuniform_nb );
+        // Create SLATE matrix from the ScaLAPACK layouts
+        matrix.A_data.resize( matrix.lld * matrix.nloc );
+        matrix.A = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
+                uplo, n, &matrix.A_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+    }
+
+    // Setup reference matrix
+    if (ref_matrix) {
+        if (nonuniform_nb && nonuniform_ref) {
+            matrix.Aref = slate::HermitianMatrix<scalar_t>(
+                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+            matrix.Aref.insertLocalTiles( slate::Target::Host );
+        }
+        else {
+            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
+            matrix.Aref = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
+                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+        }
+    }
+
+    return matrix;
+}
+
+
 #endif // SLATE_MATRIX_UTILS_HH
diff --git a/test/test.hh b/test/test.hh
index b0d48e0f3..0c62fb9f0 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -27,9 +27,9 @@ enum class Origin {
     Devices,
 };
 
-enum class Dist {
-    Row,
-    Col,
+enum class Dist : char {
+    Row = 'R',
+    Col = 'C',
 };
 
 } // namespace slate
diff --git a/test/test_posv.cc b/test/test_posv.cc
index 46f3c3cc6..4803fa137 100644
--- a/test/test_posv.cc
+++ b/test/test_posv.cc
@@ -8,12 +8,13 @@
 #include "blas/flops.hh"
 #include "lapack/flops.hh"
 #include "print_matrix.hh"
+#include "grid_utils.hh"
+#include "matrix_utils.hh"
 
 #include "scalapack_wrappers.hh"
 #include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "auxiliary/Debug.hh"
-#include "grid_utils.hh"
 
 #include
 #include
@@ -46,6 +47,7 @@ void test_posv_work(Params& params, bool run)
     int timer_level = params.timer_level();
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
+    slate::GridOrder grid_order = params.grid_order();
     slate::Dist dev_dist = params.dev_dist();
     params.matrix.mark();
     params.matrixB.mark();
@@ -158,95 +160,31 @@ void test_posv_work(Params& params, bool run)
 
     int64_t info = 0;
 
-    // Matrix A: figure out local size.
-    int64_t mlocA = num_local_rows_cols(n, nb, myrow, p);
-    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
-    int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
-
-    // Matrix B: figure out local size.
-    int64_t mlocB = num_local_rows_cols(n, nb, myrow, p);
-    int64_t nlocB = num_local_rows_cols(nrhs, nb, mycol, q);
-    int64_t lldB  = blas::max(1, mlocB); // local leading dimension of B
-
-    // ScaLAPACK data if needed.
-    std::vector<scalar_t> A_data, B_data;
-
-    // todo: work-around to initialize BaseMatrix::num_devices_
-    slate::HermitianMatrix<scalar_t> A0(uplo, n, nb, p, q, MPI_COMM_WORLD);
-
-    slate::HermitianMatrix<scalar_t> A;
-    slate::Matrix<scalar_t> B, X;
-    std::vector<scalar_t> X_data;
-    if (origin != slate::Origin::ScaLAPACK) {
-        if (dev_dist == slate::Dist::Row && target == slate::Target::Devices) {
-            // slate_assert(target == slate::Target::Devices);
-            // todo: doesn't work when lookahead is greater than 2
-            // slate_assert(lookahead < 3);
-
-            auto tileNb = slate::func::uniform_blocksize( n, nb );
-            auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col,
-                                                          p, q );
-            int num_devices = blas::get_device_count();
-            auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col,
-                                                           p, num_devices );
-
-            A = slate::HermitianMatrix<scalar_t>(
-                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
-            B = slate::Matrix<scalar_t>(
-                    n, nrhs, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
-        }
-        else {
-            A = slate::HermitianMatrix<scalar_t>(
-                    uplo, n, nb, p, q, MPI_COMM_WORLD);
-            B = slate::Matrix<scalar_t>(
-                    n, nrhs, nb, p, q, MPI_COMM_WORLD);
-        }
-
-        // SLATE allocates CPU or GPU tiles.
-        slate::Target origin_target = origin2target(origin);
-        A.insertLocalTiles(origin_target);
-
-        B.insertLocalTiles(origin_target);
-
-        if (is_iterative) {
-            X_data.resize(lldB*nlocB);
-            X = slate::Matrix<scalar_t>(n, nrhs, nb, p, q, MPI_COMM_WORLD);
-            X.insertLocalTiles(origin_target);
-        }
-    }
-    else {
-        // Create SLATE matrix from the ScaLAPACK layouts
-        A_data.resize( lldA * nlocA );
-        B_data.resize( lldB * nlocB );
-        A = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
-        B = slate::Matrix<scalar_t>::fromScaLAPACK(
-                n, nrhs, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD);
-        if (is_iterative) {
-            X_data.resize(lldB*nlocB);
-            X = slate::Matrix<scalar_t>::fromScaLAPACK(
-                    n, nrhs, &X_data[0], lldB, nb, p, q, MPI_COMM_WORLD);
-        }
+    auto A_alloc = allocate_test_HermitianMatrix<scalar_t>( check || ref, true, n, params );
+    auto B_alloc = allocate_test_Matrix<scalar_t>( check || ref, true, n, nrhs, params );
+    TestMatrix<slate::Matrix<scalar_t>> X_alloc;
+    if (is_iterative) {
+        X_alloc = allocate_test_Matrix<scalar_t>( false, true, n, nrhs, params );
     }
 
+    auto& A = A_alloc.A;
+    auto& A_data = A_alloc.A_data;
+    auto& Aref = A_alloc.Aref;
+    auto& Aref_data = A_alloc.Aref_data;
+    auto& B = B_alloc.A;
+    auto& B_data = B_alloc.A_data;
+    auto& Bref = B_alloc.Aref;
+    auto& Bref_data = B_alloc.Aref_data;
+    auto& X = X_alloc.A;
+
     slate::generate_matrix(params.matrix, A);
     slate::generate_matrix(params.matrixB, B);
     print_matrix("A", A, params);
     print_matrix("B", B, params);
 
     // if check is required, copy test data and create a descriptor for it
-    std::vector<scalar_t> Aref_data(lldA*nlocA);
-    std::vector<scalar_t> Bref_data(lldB*nlocB);
     std::vector<scalar_t> B_orig;
-    slate::HermitianMatrix<scalar_t> Aref;
-    slate::Matrix<scalar_t> Bref;
     if (check || ref) {
-        // SLATE matrix wrappers for the reference data
-        Aref = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                uplo, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
-        Bref = slate::Matrix<scalar_t>::fromScaLAPACK(
-                n, nrhs, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD);
-
         slate::copy( A, Aref );
         slate::copy( B, Bref );
@@ -414,34 +352,12 @@ void test_posv_work(Params& params, bool run)
     #ifdef SLATE_HAVE_SCALAPACK
         // A comparison with a reference routine from ScaLAPACK for timing only
         // BLACS/MPI variables
-        blas_int ictxt, p_, q_, myrow_, mycol_;
-        blas_int A_desc[9], Aref_desc[9];
-        blas_int B_desc[9], Bref_desc[9];
-        blas_int mpi_rank_ = 0, nprocs = 1;
 
         // initialize BLACS and ScaLAPACK
-        Cblacs_pinfo(&mpi_rank_, &nprocs);
-        slate_assert( mpi_rank == mpi_rank_ );
-        slate_assert(p*q <= nprocs);
-        Cblacs_get(-1, 0, &ictxt);
-        Cblacs_gridinit(&ictxt, "Col", p, q);
-        Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_);
-        slate_assert( p == p_ );
-        slate_assert( q == q_ );
-        slate_assert( myrow == myrow_ );
-        slate_assert( mycol == mycol_ );
-
-        scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info);
-        slate_assert(info == 0);
-
-        scalapack_descinit(B_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info);
-        slate_assert(info == 0);
-
-        scalapack_descinit(Aref_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info);
-        slate_assert(info == 0);
-
-        scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info);
-        slate_assert(info == 0);
+        blas_int ictxt, Aref_desc[9], Bref_desc[9];
+        create_ScaLAPACK_context( grid_order, p, q, &ictxt );
+        A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc );
+        B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc );
 
         if (check) {
             // restore Bref_data
@@ -477,9 +393,9 @@ void test_posv_work(Params& params, bool run)
 
         if (verbose > 2) {
             if (origin == slate::Origin::ScaLAPACK) {
-                slate::Debug::diffLapackMatrices(n, n, &A_data[0], lldA, &Aref_data[0], lldA, nb, nb);
+                slate::Debug::diffLapackMatrices(n, n, &A_data[0], A_alloc.lld, &Aref_data[0], A_alloc.lld, nb, nb);
                 if (params.routine != "potrf") {
-                    slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], lldB, &Bref_data[0], lldB, nb, nb);
+                    slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], B_alloc.lld, &Bref_data[0], B_alloc.lld, nb, nb);
                 }
             }
         }

From ac36b67efbad71e0726a36c6c2c3c2b452e1f927 Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Fri, 15 Dec 2023 10:33:36 -0500
Subject: [PATCH 02/33] Refactor out some duplicate code for TestMatrix class

---
 test/matrix_utils.hh | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 750afddba..ff780cc81 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -214,6 +214,20 @@ class TestMatrix {
     using scalar_t = typename MatrixType::value_type;
 
 public:
+    TestMatrix() {}
+
+    TestMatrix(int64_t m_in, int64_t n_in, int64_t nb_in,
+               int64_t p, int64_t q, slate::GridOrder grid_order)
+        : m(m_in), n(n_in), nb(nb_in)
+    {
+        int mpi_rank, myrow, mycol;
+        MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
+        gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
+        this->mloc = num_local_rows_cols( m_in, nb, myrow, p );
+        this->nloc = num_local_rows_cols( n_in, nb, mycol, q );
+        this->lld  = blas::max( 1, mloc ); // local leading dimension of A
+    }
+
     // SLATE matrices
     MatrixType A;
     MatrixType Aref;
@@ -282,18 +296,7 @@ TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
     slate::GridOrder grid_order = params.grid_order();
 
     // The object to be returned
-    TestMatrix<slate::Matrix<scalar_t>> matrix;
-    matrix.m = m;
-    matrix.n = n;
-
-    // ScaLAPACK variables for reference matrix
-    int mpi_rank, myrow, mycol;
-    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
-    gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
-    matrix.nb = nb;
-    matrix.mloc = num_local_rows_cols( m, nb, myrow, p );
-    matrix.nloc = num_local_rows_cols( n, nb, mycol, q );
-    matrix.lld  = blas::max( 1, matrix.mloc ); // local leading dimension of A
+    TestMatrix<slate::Matrix<scalar_t>> matrix( m, n, nb, p, q, grid_order );
 
     // Functions for nonuniform tile sizes.
     // Odd-numbered tiles are 2*nb, even-numbered tiles are nb.
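The constructor shown above centralizes the ScaLAPACK sizing arithmetic that
each allocate_test_* helper previously repeated. A minimal sketch of what a
caller gets back (the dimensions, grid shape, and scalar type here are
illustrative, not taken from the patch):

    // Sketch: a 1000 x 1000 double matrix on a 2 x 2 column-major grid
    // with 64 x 64 tiles.
    TestMatrix<slate::Matrix<double>> matrix( 1000, 1000, 64,
                                              2, 2, slate::GridOrder::Col );
    // mloc and nloc are this rank's local row and column counts;
    // lld = max( 1, mloc ) is the local leading dimension, so a
    // ScaLAPACK-layout buffer needs lld * nloc elements.
    std::vector<double> local_data( matrix.lld * matrix.nloc );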
@@ -400,18 +403,7 @@ TestMatrix> allocate_test_HermitianMatrix( slate::GridOrder grid_order = params.grid_order(); // The object to be returned - TestMatrix> matrix; - matrix.m = n; - matrix.n = n; - - // ScaLAPACK variables for reference matrix - int mpi_rank, myrow, mycol; - MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - matrix.nb = nb; - matrix.mloc = num_local_rows_cols( n, nb, myrow, p ); - matrix.nloc = num_local_rows_cols( n, nb, mycol, q ); - matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A + TestMatrix> matrix ( n, n, nb, p, q, grid_order ); // Functions for nonuniform tile sizes or row device distribution std::function< int64_t (int64_t j) > tileNb; From 72f871d02d96b464b0821648f135184d8db1cf36 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:05:57 -0500 Subject: [PATCH 03/33] Add support for dev_dist to test_gesv --- test/matrix_utils.hh | 29 +++++++++++---- test/run_tests.py | 84 ++++++++++++++++++++++---------------------- test/test_gesv.cc | 6 ++++ test/test_posv.cc | 20 +++++++---- 4 files changed, 84 insertions(+), 55 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index ff780cc81..9ee1b9c03 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -254,6 +254,7 @@ inline void mark_params_for_test_Matrix(Params& params) { params.grid.m(); params.grid.n(); + params.dev_dist(); params.nb(); params.nonuniform_nb(); params.origin(); @@ -290,6 +291,7 @@ TestMatrix> allocate_test_Matrix( // Load params variables int64_t p = params.grid.m(); int64_t q = params.grid.n(); + slate::Dist dev_dist = params.dev_dist(); int64_t nb = params.nb(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; slate::Origin origin = params.origin(); @@ -300,13 +302,21 @@ TestMatrix> allocate_test_Matrix( // Functions for nonuniform tile sizes. // Odd-numbered tiles are 2*nb, even-numbered tiles are nb. - std::function< int64_t (int64_t j) > - tileNb = [nb](int64_t j) { - return (j % 2 != 0 ? nb*2 : nb); - }; + std::function< int64_t (int64_t j) > tileMb, tileNb; + if (nonuniform_nb) { + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? 
nb*2 : nb); + }; + tileMb = tileNb; + } + else { + tileMb = slate::func::uniform_blocksize( m, nb ); + tileNb = slate::func::uniform_blocksize( n, nb ); + } auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), p, num_devices_ ); // Setup matrix to test SLATE with @@ -316,7 +326,9 @@ TestMatrix> allocate_test_Matrix( if (nonuniform_nb) { params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) ) + ", " + std::to_string( tileNb( 1 ) ); - matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, + } + if (nonuniform_nb || dev_dist == slate::Dist::Col) { + matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); } else { @@ -327,6 +339,7 @@ TestMatrix> allocate_test_Matrix( } else { assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); // Create SLATE matrix from the ScaLAPACK layouts matrix.A_data.resize( matrix.lld * matrix.nloc ); matrix.A = slate::Matrix::fromScaLAPACK( @@ -337,6 +350,7 @@ TestMatrix> allocate_test_Matrix( // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { + std::cout << "Using nonuniform Bref" << std::endl; matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); matrix.Aref.insertLocalTiles( slate::Target::Host ); @@ -423,7 +437,7 @@ TestMatrix> allocate_test_HermitianMatrix( // Setup matrix to test SLATE with if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Row) { + if (nonuniform_nb || dev_dist == slate::Dist::Col) { matrix.A = slate::HermitianMatrix( uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); } @@ -438,6 +452,7 @@ TestMatrix> allocate_test_HermitianMatrix( } else { assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); // Create SLATE matrix from the ScaLAPACK layouts matrix.A_data.resize( matrix.lld * matrix.nloc ); matrix.A = slate::HermitianMatrix::fromScaLAPACK( diff --git a/test/run_tests.py b/test/run_tests.py index 4561e217f..7bc730139 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -348,9 +348,9 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], # todo: hemmA GPU support @@ -386,29 +386,29 @@ def filter_csv( values, csv ): # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb ], - [ 'gesv_tntpiv', gen + dtype + la + n ], - [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], + [ 'gesv_tntpiv', gen + dtype + la + n + ddist ], + [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb 
], - [ 'getrf_tntpiv', gen + dtype + la + n ], - [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], + [ 'getrf_tntpiv', gen + dtype + la + n + ddist ], + [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb ], - [ 'getrs_tntpiv', gen + dtype + la + n + trans ], - [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb + ddist ], + [ 'getrs_tntpiv', gen + dtype + la + n + trans + ddist ], + [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + ddist + ' --matrix rand_dominant' ], - [ 'getri', gen + dtype + la + n ], - [ 'getriOOP', gen + dtype + la + n ], + [ 'getri', gen + dtype + la + n + ddist ], + [ 'getriOOP', gen + dtype + la + n + ddist ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb ], - [ 'gesv_rbt', gen + dtype + la + n ], + [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb + ddist ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb + ddist ], + [ 'gesv_rbt', gen + dtype + la + n + ddist ], ] # LU banded @@ -628,31 +628,31 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab + nonuniform_nb ], - [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + uplo ], - [ 'tradd', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'syadd', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'headd', gen + dtype + n + ab + nonuniform_nb + uplo ], - - [ 'copy', gen + dtype + mn + nonuniform_nb ], - [ 'tzcopy', gen + dtype + mn + nonuniform_nb + uplo ], - [ 'trcopy', gen + dtype + n + nonuniform_nb + uplo ], - [ 'sycopy', gen + dtype + n + nonuniform_nb + uplo ], - [ 'hecopy', gen + dtype + n + nonuniform_nb + uplo ], - - [ 'scale', gen + dtype + mn + ab + nonuniform_nb ], - [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + uplo ], - [ 'trscale', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'syscale', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'hescale', gen + dtype + n + ab + nonuniform_nb + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb ], - - [ 'set', gen + dtype + mn + ab + nonuniform_nb ], - [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + uplo ], - [ 'trset', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'syset', gen + dtype + n + ab + nonuniform_nb + uplo ], - [ 'heset', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + ddist + uplo ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + + [ 'copy', gen + dtype + mn + nonuniform_nb + ddist ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], + + [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], + [ 'trscale', gen + dtype + 
n + ab + nonuniform_nb + ddist + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist ], + + [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], ] # ------------------------------------------------------------------------------ diff --git a/test/test_gesv.cc b/test/test_gesv.cc index a19beadb8..db7a1a49d 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -75,6 +75,7 @@ void test_gesv_work(Params& params, bool run) slate::Origin origin = params.origin(); slate::Target target = params.target(); slate::GridOrder grid_order = params.grid_order(); + slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); @@ -156,6 +157,11 @@ void test_gesv_work(Params& params, bool run) if (! run) return; + if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { + params.msg() = "skipping: dev_dist = Row applies only to target devices"; + return; + } + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; return; diff --git a/test/test_posv.cc b/test/test_posv.cc index 4803fa137..1532f9193 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -42,6 +42,7 @@ void test_posv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; bool hold_local_workspace = params.hold_local_workspace() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); @@ -136,13 +137,18 @@ void test_posv_work(Params& params, bool run) {slate::Option::UseFallbackSolver, fallback}, }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Row applies only to target devices"; + params.msg() = "skipping: dev_dist = Col applies only to target devices"; + return; + } + + if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; + return; + } + + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; return; } @@ -403,6 +409,8 @@ void test_posv_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); #endif From 9dfcbd09ef45d22b2c887c691da00727618fe931 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:22:10 -0500 Subject: [PATCH 04/33] Refactor out some common error checks in the tester --- test/test_add.cc | 6 ++++++ test/test_copy.cc | 6 ++++++ test/test_gemm.cc | 7 +++++++ test/test_gesv.cc | 15 +++++---------- test/test_posv.cc | 32 +++++++++++++------------------- test/test_scale.cc | 6 ++++++ 
test/test_scale_row_col.cc | 6 ++++++ test/test_set.cc | 6 ++++++ test/test_utils.hh | 30 ++++++++++++++++++++++++++++++ 9 files changed, 85 insertions(+), 29 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index 3e51600b3..d5224e5a9 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -67,6 +68,11 @@ void test_add_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_copy.cc b/test/test_copy.cc index 84721bb9d..ffeb68e9e 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -65,6 +66,11 @@ void test_copy_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 746e2f74d..0e2a56ac9 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -7,8 +7,10 @@ #include "test.hh" #include "blas/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -86,6 +88,11 @@ void test_gemm_work(Params& params, bool run) } #endif + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, diff --git a/test/test_gesv.cc b/test/test_gesv.cc index db7a1a49d..737dca144 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -8,8 +8,10 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -72,10 +74,8 @@ void test_gesv_work(Params& params, bool run) int verbose = params.verbose(); int timer_level = params.timer_level(); SLATE_UNUSED(verbose); - slate::Origin origin = params.origin(); slate::Target target = params.target(); slate::GridOrder grid_order = params.grid_order(); - slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); @@ -154,18 +154,13 @@ void test_gesv_work(Params& params, bool run) depth = params.depth(); } - if (! run) - return; - - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Row applies only to target devices"; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { return; } - if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + if (! run) return; - } if ((params.routine == "gesv_mixed" || params.routine == "gesv_mixed_gmres") && ! 
std::is_same::value) { diff --git a/test/test_posv.cc b/test/test_posv.cc index 1532f9193..f50a04fe2 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -8,12 +8,12 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" -#include "scalapack_copy.hh" #include "auxiliary/Debug.hh" #include @@ -49,12 +49,13 @@ void test_posv_work(Params& params, bool run) slate::Origin origin = params.origin(); slate::Target target = params.target(); slate::GridOrder grid_order = params.grid_order(); - slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); slate::Method methodTrsm = params.method_trsm(); slate::Method methodHemm = params.method_hemm(); + mark_params_for_test_HermitianMatrix( params ); + // Currently only posv* supports timer_level >= 2. std::vector timer_lvl_support{ "posv", "posv_mixed", "posv_mixed_gmres" }; @@ -127,6 +128,11 @@ void test_posv_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -137,21 +143,6 @@ void test_posv_work(Params& params, bool run) {slate::Option::UseFallbackSolver, fallback}, }; - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Col applies only to target devices"; - return; - } - - if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; - return; - } - - if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; - return; - } - if ((params.routine == "posv_mixed" || params.routine == "posv_mixed_gmres") && ! std::is_same::value) { params.msg() = "skipping: unsupported mixed precision; must be type=d or z"; @@ -357,7 +348,10 @@ void test_posv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - // BLACS/MPI variables + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; diff --git a/test/test_scale.cc b/test/test_scale.cc index a98ffcb2d..b5c99c324 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -67,6 +68,11 @@ void test_scale_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 7c1ed4ffd..ea9ce0da6 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -68,6 +69,11 @@ void test_scale_row_col_work( Params& params, bool run ) if (! 
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_set.cc b/test/test_set.cc index 2dc088f0a..78c0a82e1 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -10,6 +10,7 @@ #include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" @@ -67,6 +68,11 @@ void test_set_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; diff --git a/test/test_utils.hh b/test/test_utils.hh index a946411b0..67a3d92ee 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -8,6 +8,36 @@ #include "slate/slate.hh" +///----------------------------------------------------------------------------- +/// Checks for common invalid parameter combinations +/// +/// @return true if the configuration should be skipped +/// +inline bool is_invalid_parameters(Params& params) +{ + slate::Origin origin = params.origin(); + slate::Target target = params.target(); + slate::Dist dev_dist = params.dev_dist(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + + if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { + params.msg() = "skipping: dev_dist = Col applies only to target devices"; + return true; + } + + if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; + return true; + } + + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + return true; + } + + return false; +} + ///----------------------------------------------------------------------------- /// Applies the operator thunk to each element of A and B to update B. /// The matrices must have the same size, but can have different tile sizes and From 8460bb8ef9d172587c9bec07f43d03b6381216ec Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:47:50 -0500 Subject: [PATCH 05/33] Allow TestMatrix to create ScaLAPACK contexts to reduce duplication --- test/matrix_utils.hh | 30 ++++++++++++++++++++---------- test/test_add.cc | 4 +--- test/test_copy.cc | 4 +--- test/test_gemm.cc | 5 +---- test/test_gesv.cc | 5 +---- test/test_posv.cc | 5 +---- test/test_scale.cc | 4 +--- test/test_scale_row_col.cc | 5 +++-- test/test_set.cc | 4 +--- 9 files changed, 30 insertions(+), 36 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 9ee1b9c03..18a007f14 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -9,6 +9,7 @@ #include "slate/slate.hh" #include "scalapack_wrappers.hh" +#include "grid_utils.hh" //------------------------------------------------------------------------------ // Zero out B, then copy band matrix B from A. 
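Patch 04 centralizes the skip logic and patch 05 (below) moves BLACS context
creation into TestMatrix; a condensed sketch of the boilerplate each tester
ends up with, assuming an allocation produced by allocate_test_Matrix:

    // Early-out on parameter combinations the tests do not support.
    if (is_invalid_parameters( params ))
        return;

    auto A_alloc = allocate_test_Matrix<double>( check, true, m, n, params );
    #ifdef SLATE_HAVE_SCALAPACK
        blas_int ictxt, A_desc[9];
        A_alloc.create_ScaLAPACK_context( &ictxt );     // wraps grid_order, p, q
        A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );  // descinit from mloc, nb
    #endif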
@@ -216,15 +217,15 @@ class TestMatrix { public: TestMatrix() {} - TestMatrix(int64_t m_in, int64_t n_in, int64_t nb_in, - int64_t p, int64_t q, slate::GridOrder grid_order) - : m(m_in), n(n_in), nb(nb_in) + TestMatrix(int64_t m_, int64_t n_, int64_t nb_, + int p_, int q_, slate::GridOrder grid_order_) + : m(m_), n(n_), nb(nb_), p(p_), q(q_), grid_order(grid_order_) { int mpi_rank, myrow, mycol; MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - this->mloc = num_local_rows_cols( m_in, nb, myrow, p ); - this->nloc = num_local_rows_cols( n_in, nb, mycol, q ); + this->mloc = num_local_rows_cols( m, nb, myrow, p ); + this->nloc = num_local_rows_cols( n, nb, mycol, q ); this->lld = blas::max( 1, mloc ); // local leading dimension of A } @@ -238,13 +239,22 @@ public: // ScaLAPACK configuration int64_t m, n, mloc, nloc, lld, nb; + int p, q; + slate::GridOrder grid_order; #ifdef SLATE_HAVE_SCALAPACK - void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) { + void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) + { int64_t info; scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mloc, &info); slate_assert(info == 0); } + + void create_ScaLAPACK_context( blas_int* ictxt ) + { + // Call free function version + ::create_ScaLAPACK_context( grid_order, p, q, ictxt ); + } #endif }; @@ -289,8 +299,8 @@ TestMatrix> allocate_test_Matrix( Params& params) { // Load params variables - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); + int p = params.grid.m(); + int q = params.grid.n(); slate::Dist dev_dist = params.dev_dist(); int64_t nb = params.nb(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; @@ -408,8 +418,8 @@ TestMatrix> allocate_test_HermitianMatrix( { // Load params variables slate::Uplo uplo = params.uplo(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); + int p = params.grid.m(); + int q = params.grid.n(); slate::Dist dev_dist = params.dev_dist(); int64_t nb = params.nb(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; diff --git a/test/test_add.cc b/test/test_add.cc index d5224e5a9..b0fa9581d 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -50,8 +50,6 @@ void test_add_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; @@ -139,7 +137,7 @@ void test_add_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_copy.cc b/test/test_copy.cc index ffeb68e9e..f6f6b6d7d 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -48,8 +48,6 @@ void test_copy_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! 
ref_only; @@ -141,7 +139,7 @@ void test_copy_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 0e2a56ac9..8d77b7e89 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -46,8 +46,6 @@ void test_gemm_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); int64_t k = params.dim.k(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t lookahead = params.lookahead(); bool ref_only = params.ref() == 'o'; slate::Norm norm = params.norm(); @@ -58,7 +56,6 @@ void test_gemm_work(Params& params, bool run) int verbose = params.verbose(); slate::Target target = params.target(); slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); slate::Method method_gemm = params.method_gemm(); params.matrix.mark(); params.matrixB.mark(); @@ -238,7 +235,7 @@ void test_gemm_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 737dca144..b1ccec84c 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -61,8 +61,6 @@ void test_gesv_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t ib = params.ib(); int64_t lookahead = params.lookahead(); int64_t panel_threads = params.panel_threads(); @@ -75,7 +73,6 @@ void test_gesv_work(Params& params, bool run) int timer_level = params.timer_level(); SLATE_UNUSED(verbose); slate::Target target = params.target(); - slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); params.matrixB.mark(); @@ -393,7 +390,7 @@ void test_gesv_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); diff --git a/test/test_posv.cc b/test/test_posv.cc index f50a04fe2..423257b2c 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -34,8 +34,6 @@ void test_posv_work(Params& params, bool run) slate::Uplo uplo = params.uplo(); int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); bool ref_only = params.ref() == 'o'; @@ -48,7 +46,6 @@ void test_posv_work(Params& params, bool run) int timer_level = params.timer_level(); slate::Origin origin = params.origin(); slate::Target target = params.target(); - slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); params.matrixB.mark(); slate::Method methodTrsm = params.method_trsm(); @@ -355,7 +352,7 @@ void test_posv_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + 
A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); diff --git a/test/test_scale.cc b/test/test_scale.cc index b5c99c324..95e01f16a 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -50,8 +50,6 @@ void test_scale_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; @@ -130,7 +128,7 @@ void test_scale_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index ea9ce0da6..78fcc64ab 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -58,6 +58,7 @@ void test_scale_row_col_work( Params& params, bool run ) bool trace = params.trace() == 'y'; int verbose = params.verbose(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); mark_params_for_test_Matrix( params ); @@ -144,7 +145,7 @@ void test_scale_row_col_work( Params& params, bool run ) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_max = slate::norm( slate::Norm::Max, A ); @@ -152,7 +153,7 @@ void test_scale_row_col_work( Params& params, bool run ) std::vector Rlocal( A_alloc.mloc ), Clocal( A_alloc.nloc ); int myrow, mycol; - gridinfo( A.mpiRank(), slate::GridOrder::Col, p, q, &myrow, &mycol ); + gridinfo( A.mpiRank(), grid_order, p, q, &myrow, &mycol ); // Copy local part of R. int64_t ii = 0, iilocal = 0; diff --git a/test/test_set.cc b/test/test_set.cc index 78c0a82e1..b32af8660 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -50,8 +50,6 @@ void test_set_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! 
ref_only; @@ -130,7 +128,7 @@ void test_set_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); From aa06be3c75a7c2214ae9a8e4e803a0f2b50770a5 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:48:13 -0500 Subject: [PATCH 06/33] Improve thread safety of matrix_iterator --- test/test_utils.hh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.hh b/test/test_utils.hh index 67a3d92ee..02211a394 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -69,9 +69,10 @@ void matrix_iterator( #pragma omp task shared(A, B) \ firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) { + int tag = A_i + A_j * A.mt(); if (B.tileIsLocal( B_i, B_j )) { A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), - slate::Layout::ColMajor ); + slate::Layout::ColMajor, tag ); A.tileGetForReading( A_i, A_j, ColMajor ); B.tileGetForWriting( B_i, B_j, ColMajor ); @@ -93,7 +94,7 @@ void matrix_iterator( } } else if (A.tileIsLocal( A_i, A_j )) { - A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ) ); + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); } } From 2681a861c099dfdfc38efe036d89c465e40db7d4 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 15 Dec 2023 12:55:04 -0500 Subject: [PATCH 07/33] Change default device distribution value to match behavior --- test/test.cc | 2 +- test/test_utils.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test.cc b/test/test.cc index 078df162a..16ba41ecf 100644 --- a/test/test.cc +++ b/test/test.cc @@ -357,7 +357,7 @@ Params::Params(): method_trsm ("trsm", 4, ParamType::List, 0, str2methodTrsm, methodTrsm2str, "auto=auto, A=trsmA, B=trsmB"), grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"), - dev_dist ("dev-dist",9, ParamType::List, slate::Dist::Col, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"), + dev_dist ("dev-dist",9, ParamType::List, slate::Dist::Row, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"), // name, w, type, default, char2enum, enum2char, enum2str, help layout ("layout", 6, ParamType::List, slate::Layout::ColMajor, blas::char2layout, blas::layout2char, blas::layout2str, "layout: r=row major, c=column major"), diff --git a/test/test_utils.hh b/test/test_utils.hh index 02211a394..acedfedb5 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -20,7 +20,7 @@ inline bool is_invalid_parameters(Params& params) slate::Dist dev_dist = params.dev_dist(); bool nonuniform_nb = params.nonuniform_nb() == 'y'; - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { + if (target != slate::Target::Devices && dev_dist == slate::Dist::Col) { params.msg() = "skipping: dev_dist = Col applies only to target devices"; return true; } From 8758c28ae20c056d8c0f19fa983fdff11d7d2557 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 11:03:27 -0500 Subject: [PATCH 08/33] Add grid order to test/run_tests.py --- test/run_tests.py | 98 ++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index 7bc730139..06c375395 
100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -127,6 +127,7 @@ group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' ) group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' ) group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' ) +group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='r' ) group_opt.add_argument( '--repeat', action='store', help='times to repeat each test', default='' ) group_opt.add_argument( '--thresh', action='store', help='default=%(default)s', default='1,0.5' ) group_opt.add_argument( '--matrix', action='store', help='default=%(default)s', default='' ) @@ -302,6 +303,7 @@ nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else '' nt = ' --nt ' + opts.nt if (opts.nt) else '' grid = ' --grid ' + opts.grid if (opts.grid) else '' +grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else '' repeat = ' --repeat ' + opts.repeat if (opts.repeat) else '' thresh = ' --thresh ' + opts.thresh if (opts.thresh) else '' matrix = ' --matrix ' + opts.matrix if (opts.matrix) else '' @@ -348,9 +350,9 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], # todo: hemmA GPU support @@ -386,29 +388,29 @@ def filter_csv( values, csv ): # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], - [ 'gesv_tntpiv', gen + dtype + la + n + ddist ], - [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + [ 'gesv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], + [ 'gesv_tntpiv', gen + dtype + la + n + ddist + grid_order ], + [ 'gesv_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb + ddist ], - [ 'getrf_tntpiv', gen + dtype + la + n + ddist ], - [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + ddist + [ 'getrf', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], + [ 'getrf_tntpiv', gen + dtype + la + n + ddist + grid_order ], + [ 'getrf_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb + ddist ], - [ 'getrs_tntpiv', gen + dtype + la + n + trans + ddist ], - [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + ddist + [ 'getrs', gen + dtype + la + n + trans + ddist + grid_order + nonuniform_nb + thresh ], + [ 'getrs_tntpiv', gen + dtype + la + n + ddist + grid_order ], + [ 'getrs_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getri', gen 
+ dtype + la + n + ddist ], - [ 'getriOOP', gen + dtype + la + n + ddist ], + [ 'getri', gen + dtype + la + n + ddist + grid_order ], + [ 'getriOOP', gen + dtype + la + n + ddist + grid_order ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb + ddist ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb + ddist ], - [ 'gesv_rbt', gen + dtype + la + n + ddist ], + [ 'gesv_mixed', gen + dtype_double + la + n + ddist + grid_order + nonuniform_nb ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ddist + grid_order + nonuniform_nb ], + [ 'gesv_rbt', gen + dtype + la + n + ddist + grid_order ], ] # LU banded @@ -424,14 +426,14 @@ def filter_csv( values, csv ): # Cholesky if (opts.chol): cmds += [ - [ 'posv', gen + dtype + la + n + uplo ], - [ 'potrf', gen + dtype + la + n + uplo + ddist ], - [ 'potrs', gen + dtype + la + n + uplo ], - [ 'potri', gen + dtype + la + n + uplo ], + [ 'posv', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'potrf', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'potrs', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'potri', gen + dtype + la + n + uplo + ddist + grid_order ], #[ 'porfs', gen + dtype + la + n + uplo ], #[ 'poequ', gen + dtype + la + n ], # only diagonal elements (no uplo) - [ 'posv_mixed', gen + dtype_double + la + n + uplo ], - [ 'posv_mixed_gmres', gen + dtype_double + la + n + uplo + ' --nrhs 1' ], + [ 'posv_mixed', gen + dtype_double + la + n + uplo + ddist + grid_order ], + [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + uplo + ddist + grid_order ], [ 'trtri', gen + dtype + la + n + uplo + diag ], ] @@ -628,31 +630,31 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist ], - [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], - [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + ddist + uplo ], - [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - - [ 'copy', gen + dtype + mn + nonuniform_nb + ddist ], - [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + uplo ], - [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], - [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], - [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + uplo ], - - [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist ], - [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], - [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist ], - - [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist ], - [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + uplo ], - [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], - [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + uplo ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + 
grid_order + uplo ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + + [ 'copy', gen + dtype + mn + nonuniform_nb + ddist + grid_order ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + grid_order + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], + + [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist + grid_order ], + + [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], ] # ------------------------------------------------------------------------------ From fa4a5c361570d976a44655b417dd2ba23ceb6093 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 15:31:02 -0500 Subject: [PATCH 09/33] Add options to hemm and symm, plus fix some other minor issues --- test/matrix_utils.hh | 110 +++++++++++++++++------ test/run_tests.py | 8 +- test/test.hh | 8 +- test/test_gemm.cc | 10 +-- test/test_hemm.cc | 190 ++++++++++++++++----------------------- test/test_posv.cc | 1 + test/test_symm.cc | 208 ++++++++++++++++++------------------------- 7 files changed, 259 insertions(+), 276 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 18a007f14..59f9f3aa4 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -360,7 +360,6 @@ TestMatrix> allocate_test_Matrix( // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { - std::cout << "Using nonuniform Bref" << std::endl; matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); matrix.Aref.insertLocalTiles( slate::Target::Host ); @@ -391,26 +390,18 @@ inline void mark_params_for_test_HermitianMatrix(Params& params) } // ----------------------------------------------------------------------------- -/// Allocates a HermitianMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. 
-///
-/// @param m[in]
-///     The number of rows
-///
-/// @param n[in]
-///     The number of columns
-///
-/// @param params[in]
-///     The test params object which contains many of the key parameters
+/// Marks the parameters used by allocate_test_SymmetricMatrix
+inline void mark_params_for_test_SymmetricMatrix(Params& params)
+{
+    mark_params_for_test_HermitianMatrix( params );
+}
+
+
+// -----------------------------------------------------------------------------
+/// Helper routine to avoid duplicating logic between HermitianMatrix and SymmetricMatrix
 ///
-template <typename scalar_t>
-TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+template <typename matrixtype>
+TestMatrix<matrixtype> allocate_test_HeSyMatrix(
     bool ref_matrix,
     bool nonuniform_ref,
     int64_t n,
     Params& params)
@@ -427,7 +418,7 @@ TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
     slate::GridOrder grid_order = params.grid_order();
 
     // The object to be returned
-    TestMatrix<slate::HermitianMatrix<scalar_t>> matrix ( n, n, nb, p, q, grid_order );
+    TestMatrix<matrixtype> matrix ( n, n, nb, p, q, grid_order );
 
     // Functions for nonuniform tile sizes or row device distribution
     std::function< int64_t (int64_t j) > tileNb;
@@ -448,12 +439,12 @@ TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
     // Setup matrix to test SLATE with
     if (origin != slate::Origin::ScaLAPACK) {
         if (nonuniform_nb || dev_dist == slate::Dist::Col) {
-            matrix.A = slate::HermitianMatrix<scalar_t>(
+            matrix.A = matrixtype(
                     uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
         }
         else {
-            matrix.A = slate::HermitianMatrix<scalar_t>(
-                    uplo, n, nb, p, q, MPI_COMM_WORLD);
+            matrix.A = matrixtype(
+                    uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD);
         }
 
         // SLATE allocates CPU or GPU tiles.
@@ -465,26 +456,87 @@ TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
         assert( dev_dist == slate::Dist::Row );
         // Create SLATE matrix from the ScaLAPACK layouts
         matrix.A_data.resize( matrix.lld * matrix.nloc );
-        matrix.A = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                uplo, n, &matrix.A_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+        matrix.A = matrixtype::fromScaLAPACK(
+                uplo, n, &matrix.A_data[0], matrix.lld, nb,
+                grid_order, p, q, MPI_COMM_WORLD);
     }
 
     // Setup reference matrix
     if (ref_matrix) {
         if (nonuniform_nb && nonuniform_ref) {
-            matrix.A = slate::HermitianMatrix<scalar_t>(
+            matrix.Aref = matrixtype(
                     uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
             matrix.Aref.insertLocalTiles( slate::Target::Host );
         }
         else {
             matrix.Aref_data.resize( matrix.lld * matrix.nloc );
-            matrix.Aref = slate::HermitianMatrix<scalar_t>::fromScaLAPACK(
-                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb, p, q, MPI_COMM_WORLD);
+            matrix.Aref = matrixtype::fromScaLAPACK(
+                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD);
         }
     }
 
     return matrix;
 }
 
+// -----------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and a reference version for testing.
+///
+/// @param ref_matrix[in]
+///     Whether to allocate a reference matrix
+///
+/// @param nonuniform_ref[in]
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_HermitianMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params) +{ + return allocate_test_HeSyMatrix>( + ref_matrix, nonuniform_ref, n, params ); +} + +// ----------------------------------------------------------------------------- +/// Allocates a SymmetricMatrix and a reference version for testing. +/// +/// @param ref_matrix[in] +/// Whether to allocate a reference matrix +/// +/// @param nonuniform_ref[in] +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_SymmetricMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params) +{ + return allocate_test_HeSyMatrix>( + ref_matrix, nonuniform_ref, n, params ); +} #endif // SLATE_MATRIX_UTILS_HH diff --git a/test/run_tests.py b/test/run_tests.py index 06c375395..b691e3523 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -354,10 +354,10 @@ def filter_csv( values, csv ): [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], - [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], + [ 'hemm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], # todo: hemmA GPU support - [ 'hemmA', gen_no_target + dtype + la + side + uplo + mn + ab + matrixBC ], - [ 'hemmC', gen + dtype + la + side + uplo + mn + ab + matrixBC], + [ 'hemmA', gen_no_target + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], + [ 'hemmC', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC], [ 'hbmm', gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ], @@ -367,7 +367,7 @@ def filter_csv( values, csv ): [ 'her2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixBC ], [ 'her2k', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixBC ], - [ 'symm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], + [ 'symm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], [ 'syr2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixC ], [ 'syr2k', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixC ], diff --git a/test/test.hh b/test/test.hh index 0c62fb9f0..57c428443 100644 --- a/test/test.hh +++ b/test/test.hh @@ -21,10 +21,10 @@ // ----------------------------------------------------------------------------- namespace slate { -enum class Origin { - Host, - ScaLAPACK, - Devices, +enum class Origin : char { + Host = 'H', + ScaLAPACK = 'S', + Devices = 'D', }; enum class Dist : char { diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 8d77b7e89..ca1356fb4 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -76,6 +76,11 @@ void test_gemm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + #ifndef SLATE_HAVE_SCALAPACK // Can run ref only when we have ScaLAPACK. 
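Stepping back to the test.hh change above: backing Origin (and Dist) with its option character lets the enum round-trip through the flag directly. A minimal sketch, assuming test.hh is included; the demo function name is hypothetical:

    #include <cassert>
    #include "test.hh"

    // Hypothetical demo: char-backed enums convert to and from their
    // option characters with plain casts, so no lookup tables are needed.
    void origin_roundtrip_demo()
    {
        slate::Origin origin = slate::Origin( 'D' );  // parse option flag
        char flag = char( slate::Origin::Devices );   // format back: 'D'
        assert( origin == slate::Origin::Devices );
        assert( flag == 'D' );
    }
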
if (ref) { @@ -85,11 +90,6 @@ void test_gemm_work(Params& params, bool run) } #endif - // Check for common invalid combinations - if (is_invalid_parameters( params )) { - return; - } - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, diff --git a/test/test_hemm.cc b/test/test_hemm.cc index 946a60798..45d5f50a2 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -8,10 +8,12 @@ #include "blas/flops.hh" #include "print_matrix.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -45,12 +47,12 @@ void test_hemm_work(Params& params, bool run) int p = params.grid.m(); int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); slate::Origin origin = params.origin(); slate::Target target = params.target(); @@ -59,6 +61,9 @@ void test_hemm_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +77,20 @@ void test_hemm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + + #ifndef SLATE_HAVE_SCALAPACK + // Can only run ref when we have ScaLAPACK + if (ref) { + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + ref = false; + } + #endif + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -85,7 +104,6 @@ void test_hemm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; @@ -96,82 +114,45 @@ void test_hemm_work(Params& params, bool run) MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); gridinfo(mpi_rank, p, q, &myrow, &mycol); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::HermitianMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
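The fixed-nb constructors being deleted below give way to the function-based constructors inside the allocation helpers when nonuniform_nb is set. A sketch under the tests' odd-indices-get-double-tiles convention; the helper name make_nonuniform_hermitian is hypothetical, the slate::func calls are those used in matrix_utils.hh:

    #include <functional>
    #include "slate/slate.hh"

    // Hypothetical helper: build a HermitianMatrix whose tile sizes come
    // from a callback instead of a uniform nb.
    template <typename scalar_t>
    slate::HermitianMatrix<scalar_t> make_nonuniform_hermitian(
        slate::Uplo uplo, int64_t n, int64_t nb,
        slate::GridOrder grid_order, int p, int q, int num_devices )
    {
        // Odd block indices get double-width tiles (the tests' convention).
        std::function< int64_t (int64_t) > tileNb =
            [nb]( int64_t j ) { return j % 2 != 0 ? 2*nb : nb; };
        auto tileRank   = slate::func::process_2d_grid( grid_order, p, q );
        auto tileDevice = slate::func::device_1d_grid(
                              slate::GridOrder::Col, p, num_devices );
        return slate::HermitianMatrix<scalar_t>(
            uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD );
    }
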
- A = slate::HermitianMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_HermitianMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - std::vector Cref_data; - slate::Matrix Cref; - if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - m, n, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy(C, Cref); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). @@ -237,6 +218,9 @@ void test_hemm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
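Backing up a step: the Y consumed by the check below is built in the first half above, before hemm overwrites C. A sketch of the Left case using the classic API (the test's exact calls may differ; the helper name is hypothetical):

    #include "slate/slate.hh"

    // Hypothetical sketch of the Left-side first half:
    //     Y = alpha A (B X) + beta (C X),
    // so that C2*X from the routine under test can be compared against Y.
    template <typename scalar_t>
    void residual_first_half(
        scalar_t alpha, slate::HermitianMatrix<scalar_t>& A,
        slate::Matrix<scalar_t>& B,
        scalar_t beta, slate::Matrix<scalar_t>& C,
        slate::Matrix<scalar_t>& X, slate::Matrix<scalar_t>& Y,
        slate::Matrix<scalar_t>& Z, slate::Options const& opts )
    {
        scalar_t zero = 0, one = 1;
        slate::gemm( beta, C, X, zero, Y, opts );  // Y = beta (C X)
        slate::gemm( one,  B, X, zero, Z, opts );  // Z = B X
        slate::hemm( slate::Side::Left,
                     alpha, A, Z, one, Y, opts );  // Y += alpha A Z
    }
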
real_t y_norm = slate::norm( norm, Y, opts ); @@ -254,57 +238,36 @@ void test_hemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocA, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocA + mlocA + ldw); - std::vector worklange(std::max({mlocC, nlocC, mlocB, nlocB})); - - // get norms of the original data - real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An, - &A_data[0], 1, 1, A_desc, &worklansy[0]); - real_t B_norm = scalapack_plange( - norm2str(norm), Bm, Bn, &B_data[0],1, 1, B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plange( - norm2str(norm), Cm, Cn, &Cref_data[0], 1, 1, Cref_desc, &worklange[0]); - //================================================== // Run ScaLAPACK reference routine. 
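The create_ScaLAPACK_context / ScaLAPACK_descriptor calls used below absorb the Cblacs_* and scalapack_descinit boilerplate the old code repeated per matrix. A minimal sketch of the pattern; the wrapper function name is hypothetical, the member names are those of TestMatrix in matrix_utils.hh:

    #include "matrix_utils.hh"
    #include "scalapack_wrappers.hh"

    // Hypothetical helper: set up a BLACS context and descriptor for one
    // TestMatrix allocation, then mirror its tiles into ScaLAPACK layout.
    template <typename scalar_t>
    void to_scalapack( TestMatrix<slate::Matrix<scalar_t>>& A_alloc,
                       blas_int* ictxt, blas_int A_desc[9] )
    {
        A_alloc.create_ScaLAPACK_context( ictxt );      // BLACS grid, p x q
        A_alloc.ScaLAPACK_descriptor( *ictxt, A_desc ); // descinit from nb, lld
        A_alloc.A_data.resize( A_alloc.lld * A_alloc.nloc );
        copy( A_alloc.A, &A_alloc.A_data[0], A_desc );  // tiles -> 2D block cyclic
    }
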
//================================================== @@ -315,12 +278,13 @@ void test_hemm_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); + + print_matrix( "Diff", C, params ); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm diff --git a/test/test_posv.cc b/test/test_posv.cc index 423257b2c..6193a07ef 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -52,6 +52,7 @@ void test_posv_work(Params& params, bool run) slate::Method methodHemm = params.method_hemm(); mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); // Currently only posv* supports timer_level >= 2. std::vector timer_lvl_support{ "posv", "posv_mixed", diff --git a/test/test_symm.cc b/test/test_symm.cc index c80dd3d98..6a8dc515d 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -6,11 +6,14 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,18 +41,21 @@ void test_symm_work(Params& params, bool run) int p = params.grid.m(); int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -63,6 +69,20 @@ void test_symm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + + #ifndef SLATE_HAVE_SCALAPACK + // Can only run ref when we have ScaLAPACK + if (ref) { + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + ref = false; + } + #endif + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,7 +93,6 @@ void test_symm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; @@ -84,91 +103,45 @@ void test_symm_work(Params& params, bool run) MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); gridinfo(mpi_rank, p, q, &myrow, &mycol); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. 
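The local-size computations being removed on either side of this point are the standard 2D block-cyclic arithmetic that TestMatrix now performs internally. For the record, a sketch using the same num_local_rows_cols helper as the removed code:

    // Sketch: local extents of an m x n matrix on process (myrow, mycol)
    // of a p x q grid with tile size nb.
    int64_t mloc = num_local_rows_cols( m, nb, myrow, p );  // local row count
    int64_t nloc = num_local_rows_cols( n, nb, mycol, q );  // local col count
    int64_t lld  = blas::max( 1, mloc );  // local leading dimension, >= 1
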
- int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::SymmetricMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::SymmetricMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_SymmetricMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data and create a descriptor for it. - slate::Matrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - Cm, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif - - if (side == slate::Side::Left) - slate_assert(A.mt() == C.mt()); - else - slate_assert(A.mt() == C.nt()); - slate_assert(B.mt() == C.mt()); - slate_assert(B.nt() == C.nt()); + if (ref) { + slate::copy( C, Cref ); + } - if (trace) slate::trace::Trace::on(); - else slate::trace::Trace::off(); + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! 
ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). @@ -192,6 +165,16 @@ void test_symm_work(Params& params, bool run) throw slate::Exception("unknown side"); } + if (side == slate::Side::Left) + slate_assert(A.mt() == C.mt()); + else + slate_assert(A.mt() == C.nt()); + slate_assert(B.mt() == C.mt()); + slate_assert(B.nt() == C.nt()); + + if (trace) slate::trace::Trace::on(); + else slate::trace::Trace::off(); + double time = barrier_get_wtime(MPI_COMM_WORLD); //================================================== @@ -218,6 +201,9 @@ void test_symm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -235,57 +221,36 @@ void test_symm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. 
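After the copy-back just below, the comparison itself now stays entirely in SLATE; a sketch of the pattern these patches introduce, where one is the scalar_t constant 1 used throughout the tests:

    // Sketch: form the difference and its norm on the distributed
    // matrices; no ScaLAPACK work arrays are needed.
    slate::add( -one, Cref, one, C );             // C = C - Cref
    print_matrix( "Diff", C, params );            // optional debug output
    real_t C_diff_norm = slate::norm( norm, C );  // || C - Cref ||
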
copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - size_t ldw = nb*ceildiv( ceildiv( mlocA, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocA + mlocA + ldw); - std::vector worklange(std::max({mlocC, nlocC, mlocB, nlocB})); - - // get norms of the original data - real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An, - &A_data[0], 1, 1, A_desc, &worklansy[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); - //================================================== // Run ScaLAPACK reference routine. //================================================== @@ -297,12 +262,13 @@ void test_symm_work(Params& params, bool run) MPI_Barrier(MPI_COMM_WORLD); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); + + print_matrix( "Diff", C, params ); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm From 08272710ff3857cfc778ee84fdf5792f19acde08 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 15:53:03 -0500 Subject: [PATCH 10/33] Refactor out the handling of some reference warning messages --- test/test_add.cc | 2 +- test/test_copy.cc | 2 +- test/test_gemm.cc | 14 -------------- test/test_gesv.cc | 7 +------ test/test_hemm.cc | 23 +---------------------- test/test_posv.cc | 9 +-------- test/test_scale.cc | 2 +- test/test_scale_row_col.cc | 2 +- test/test_set.cc | 2 +- test/test_symm.cc | 23 +---------------------- test/test_utils.hh | 14 ++++++++++++++ 11 files changed, 23 insertions(+), 77 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index b0fa9581d..b6fa65700 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -199,7 +199,7 @@ void test_add_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( trans ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_copy.cc b/test/test_copy.cc index f6f6b6d7d..f8aa9bd1c 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -187,7 +187,7 @@ void test_copy_work(Params& params, bool run) #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( A_norm ); SLATE_UNUSED( B_norm ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_gemm.cc b/test/test_gemm.cc index ca1356fb4..359b0912b 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -52,7 +52,6 @@ void test_gemm_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); slate::Target target = params.target(); slate::Origin origin = params.origin(); @@ -81,15 +80,6 @@ void test_gemm_work(Params& params, bool run) return; } - #ifndef SLATE_HAVE_SCALAPACK - // Can run ref only when we have ScaLAPACK. 
- if (ref) { - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - ref = false; - } - #endif - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -228,10 +218,6 @@ void test_gemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; diff --git a/test/test_gesv.cc b/test/test_gesv.cc index b1ccec84c..422dc686a 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -68,7 +68,6 @@ void test_gesv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); SLATE_UNUSED(verbose); @@ -383,10 +382,6 @@ void test_gesv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; @@ -431,7 +426,7 @@ void test_gesv_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_hemm.cc b/test/test_hemm.cc index 45d5f50a2..b71708064 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -44,15 +44,12 @@ void test_hemm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); slate::Origin origin = params.origin(); slate::Target target = params.target(); @@ -82,15 +79,6 @@ void test_hemm_work(Params& params, bool run) return; } - #ifndef SLATE_HAVE_SCALAPACK - // Can only run ref when we have ScaLAPACK - if (ref) { - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - ref = false; - } - #endif - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -109,11 +97,6 @@ void test_hemm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - auto A_alloc = allocate_test_HermitianMatrix( false, true, An, params ); auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); @@ -238,10 +221,6 @@ void test_hemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int 
ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; @@ -300,7 +279,7 @@ void test_hemm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_posv.cc b/test/test_posv.cc index 6193a07ef..161413a70 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -40,7 +40,6 @@ void test_posv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; bool hold_local_workspace = params.hold_local_workspace() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); @@ -346,10 +345,6 @@ void test_posv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; @@ -401,9 +396,7 @@ void test_posv_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale.cc b/test/test_scale.cc index 95e01f16a..d6a71c5af 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -168,7 +168,7 @@ void test_scale_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 78fcc64ab..e239beb0d 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -238,7 +238,7 @@ void test_scale_row_col_work( Params& params, bool run ) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_set.cc b/test/test_set.cc index b32af8660..5e1fdf024 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -164,7 +164,7 @@ void test_set_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_symm.cc b/test/test_symm.cc index 6a8dc515d..50bb601ec 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -38,15 +38,12 @@ void test_symm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); @@ -74,15 +71,6 @@ void test_symm_work(Params& params, bool run) 
return; } - #ifndef SLATE_HAVE_SCALAPACK - // Can only run ref when we have ScaLAPACK - if (ref) { - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - ref = false; - } - #endif - slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -98,11 +86,6 @@ void test_symm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - auto A_alloc = allocate_test_SymmetricMatrix( false, true, An, params ); auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); @@ -221,10 +204,6 @@ void test_symm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; @@ -283,7 +262,7 @@ void test_symm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_utils.hh b/test/test_utils.hh index acedfedb5..c16d334a7 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -35,6 +35,20 @@ inline bool is_invalid_parameters(Params& params) return true; } + #ifdef SLATE_HAVE_SCALAPACK + if (nonuniform_nb && params.ref()) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + params.ref() = false; + } + #else + // Can only run ref when we have ScaLAPACK + if (params.ref()) { + params.msg() = "skipping reference: ScaLAPACK not available"; + params.ref() = false; + } + #endif + + return false; } From 08d43b590fd4747e0363822157ba0d5f197c8d04 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 16:34:32 -0500 Subject: [PATCH 11/33] Add options to sy/herk and sy/her2k --- test/run_tests.py | 16 ++-- test/test_her2k.cc | 198 +++++++++++++++++---------------------------- test/test_herk.cc | 176 +++++++++++++++------------------------- test/test_syr2k.cc | 190 +++++++++++++++---------------------------- test/test_syrk.cc | 166 ++++++++++++++----------------------- 5 files changed, 271 insertions(+), 475 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index b691e3523..584b96bf7 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -361,19 +361,19 @@ def filter_csv( values, csv ): [ 'hbmm', gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ], - [ 'herk', gen + dtype_real + la + uplo + trans + mn + ab + matrixC ], - [ 'herk', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixC ], + [ 'herk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], + [ 'herk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixC ], - [ 'her2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixBC ], - [ 'her2k', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixBC ], [ 'symm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], - [ 
'syr2k', gen + dtype_real + la + uplo + trans + mn + ab + matrixC ], - [ 'syr2k', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixC ], - [ 'syrk', gen + dtype_real + la + uplo + trans + mn + ab + matrixBC ], - [ 'syrk', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixBC ], # todo: tbsm fails for nb=8 or 16 with --quick. [ 'tbsm', gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ], diff --git a/test/test_her2k.cc b/test/test_her2k.cc index a2af3cef2..3c0ec777e 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -8,10 +8,12 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +40,7 @@ void test_her2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); real_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +52,9 @@ void test_her2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +68,11 @@ void test_her2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -81,72 +88,32 @@ void test_her2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector< scalar_t > A_data(lldA*nlocA); - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - std::vector< scalar_t > B_data(lldB*nlocB); - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - std::vector< scalar_t > C_data(lldC*nlocC); - - slate::Matrix A, B; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
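What replaces the removed block below is the TestMatrix helper pattern; a sketch with the template arguments restored (an assumption, since the diff text elides angle brackets):

    // Sketch: allocate test and (optional) reference matrices in one call;
    // the two bool flags are ref_matrix and nonuniform_ref.
    auto A_alloc = allocate_test_Matrix<scalar_t>( false, true, Am, An, params );
    auto C_alloc = allocate_test_HermitianMatrix<scalar_t>( ref, true, Cn, params );

    auto& A    = A_alloc.A;      // matrix under test
    auto& C    = C_alloc.A;
    auto& Cref = C_alloc.Aref;   // allocated only when ref == true
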
- A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); + + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector< scalar_t > Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -172,17 +139,19 @@ void test_her2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^H X)) + (conj(alpha) B (A^H X)) + (beta C X). // Y = beta C X @@ -221,6 +190,9 @@ void test_her2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -239,54 +211,30 @@ void test_her2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or CPU tiles. + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + copy(C, &C_data[0], C_desc); + } //================================================== // Run ScaLAPACK reference routine. 
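The error scaling computed in the reference hunk that follows is the rank-2k analogue of the other checks; spelled out in LaTeX (the trailing beta term is assumed from the matching tests, since the hunk ends mid-expression):

    \mathrm{error} \;=\;
        \frac{\| C - C_{\mathrm{ref}} \|}
             {\sqrt{2k + 2}\; |\alpha|\, \|A\|\, \|B\|
              \;+\; 2\, |\beta|\, \|C_{\mathrm{orig}}\|}
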
@@ -300,13 +248,11 @@ void test_her2k_work(Params& params, bool run) print_matrix("Cref", Cref, params); - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm @@ -323,7 +269,7 @@ void test_her2k_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_herk.cc b/test/test_herk.cc index 8e5660241..9435bb550 100644 --- a/test/test_herk.cc +++ b/test/test_herk.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +37,7 @@ void test_herk_work(Params& params, bool run) int64_t k = params.dim.k(); real_t alpha = params.alpha.get(); real_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -49,6 +48,9 @@ void test_herk_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -62,6 +64,11 @@ void test_herk_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -75,64 +82,28 @@ void test_herk_work(Params& params, bool run) // setup so op(A) is n-by-k int64_t Am = (transA == slate::Op::NoTrans ? n : k); int64_t An = (transA == slate::Op::NoTrans ? k : n); - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } - - slate::Matrix A; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
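A note on the "shallow copy for transposing" idiom that the herk hunk below keeps: transposition in SLATE is a view, not a data movement. A minimal sketch; the helper name is hypothetical:

    #include "slate/slate.hh"

    // Hypothetical sketch: keep the original A and operate on an alias.
    template <typename scalar_t>
    slate::Matrix<scalar_t> apply_trans(
        slate::Op trans, slate::Matrix<scalar_t>& A )
    {
        slate::Matrix<scalar_t> opA = A;  // shallow copy; shares tiles
        if (trans == slate::Op::Trans)
            opA = slate::transpose( A );
        else if (trans == slate::Op::ConjTrans)
            opA = slate::conj_transpose( A );
        return opA;  // no tile data was moved
    }
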
- A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, C ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -147,17 +118,19 @@ void test_herk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^H X) + (beta C X). // Y = beta C X @@ -189,6 +162,9 @@ void test_herk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -207,48 +183,26 @@ void test_herk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or CPU tiles. + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); + } //================================================== // Run ScaLAPACK reference routine. 
@@ -259,13 +213,11 @@ void test_herk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -281,7 +233,7 @@ void test_herk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index 2ae006ce1..e3fa8cc22 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -8,10 +8,12 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +40,7 @@ void test_syr2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +52,9 @@ void test_syr2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +68,11 @@ void test_syr2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,77 +86,32 @@ void test_syr2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
- std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::Matrix A, B; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector< scalar_t > Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -175,17 +137,19 @@ void test_syr2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^T X)) + alpha B (A^T X)) + (beta C X). // Y = beta C X @@ -224,6 +188,9 @@ void test_syr2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -242,54 +209,31 @@ void test_syr2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -302,11 +246,11 @@ void test_syr2k_work(Params& params, bool run) print_matrix("Cref", Cref, params); - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm @@ -323,7 +267,7 @@ void test_syr2k_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_syrk.cc b/test/test_syrk.cc index 03d6de765..c8683b837 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +37,7 @@ void test_syrk_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -49,6 +48,9 @@ void test_syrk_work(Params& params, bool run) params.matrix.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -62,6 +64,11 @@ void test_syrk_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,64 +80,28 @@ void test_syrk_work(Params& params, bool run) // setup so op(A) is n-by-k int64_t Am = (transA == slate::Op::NoTrans ? n : k); int64_t An = (transA == slate::Op::NoTrans ? k : n); - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } - - slate::Matrix A; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
- A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + if (ref) { + slate::copy( C, Cref ); + } + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -145,17 +116,19 @@ void test_syrk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^T X) + (beta C X). // Y = beta C X @@ -187,6 +160,9 @@ void test_syrk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -205,49 +181,27 @@ void test_syrk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -257,11 +211,11 @@ void test_syrk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime( MPI_COMM_WORLD ) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -277,7 +231,7 @@ void test_syrk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } From 799cd98127fded46d2f0f8a0708386d9b28c6e12 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 18 Dec 2023 16:41:34 -0500 Subject: [PATCH 12/33] Consolidate some if statements --- test/test_gemm.cc | 15 ++++++--------- test/test_hemm.cc | 6 ++---- test/test_her2k.cc | 6 ++---- test/test_herk.cc | 6 ++---- test/test_symm.cc | 6 ++---- test/test_syr2k.cc | 6 ++---- test/test_syrk.cc | 6 ++---- 7 files changed, 18 insertions(+), 33 deletions(-) diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 359b0912b..c20db8ea0 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -110,9 +110,14 @@ void test_gemm_work(Params& params, bool run) slate::generate_matrix(params.matrixB, B); slate::generate_matrix(params.matrixC, C); - // if reference run is required, copy test data. + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); } if (transA == slate::Op::Trans) @@ -129,14 +134,6 @@ void test_gemm_work(Params& params, bool run) slate_assert(B.nt() == C.nt()); slate_assert(A.nt() == B.mt()); - // If reference run is required, record norms to be used in the check/ref. - real_t A_norm=0, B_norm=0, C_orig_norm=0; - if (ref) { - A_norm = slate::norm(norm, A); - B_norm = slate::norm(norm, B); - C_orig_norm = slate::norm(norm, Cref); - } - // If check run, perform first half of SLATE residual check. TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { diff --git a/test/test_hemm.cc b/test/test_hemm.cc index b71708064..a31612b53 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -110,13 +110,11 @@ void test_hemm_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. 
real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_her2k.cc b/test/test_her2k.cc index 3c0ec777e..fc8d4077b 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -103,13 +103,11 @@ void test_her2k_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_herk.cc b/test/test_herk.cc index 9435bb550..d3f9144ba 100644 --- a/test/test_herk.cc +++ b/test/test_herk.cc @@ -94,13 +94,11 @@ void test_herk_work(Params& params, bool run) slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); C_orig_norm = slate::norm(norm, Cref); } diff --git a/test/test_symm.cc b/test/test_symm.cc index 50bb601ec..e463b5d34 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -99,13 +99,11 @@ void test_symm_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index e3fa8cc22..a42aaf6c9 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -101,13 +101,11 @@ void test_syr2k_work(Params& params, bool run) slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. real_t A_norm=0, B_norm=0, C_orig_norm=0; if (ref) { + slate::copy( C, Cref ); + A_norm = slate::norm(norm, A); B_norm = slate::norm(norm, B); C_orig_norm = slate::norm(norm, Cref); diff --git a/test/test_syrk.cc b/test/test_syrk.cc index c8683b837..bc8e6e1d0 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -92,13 +92,11 @@ void test_syrk_work(Params& params, bool run) slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixC, C ); - if (ref) { - slate::copy( C, Cref ); - } - // If reference run is required, record norms to be used in the check/ref. 
real_t A_norm=0, C_orig_norm=0;
     if (ref) {
+        slate::copy( C, Cref );
+
         A_norm = slate::norm(norm, A);
         C_orig_norm = slate::norm(norm, Cref);
     }

From 8b00be41e64f5b50b9600414fe0f2cf9f29e820c Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Tue, 19 Dec 2023 12:27:17 -0500
Subject: [PATCH 13/33] Add options to trmm and trsm

---
 test/matrix_utils.hh | 115 ++++++++++++++++++++++++++++++
 test/run_tests.py    |   8 +--
 test/test_trmm.cc    | 162 ++++++++++++++++---------------------
 test/test_trsm.cc    | 115 +++++++++--------------------
 4 files changed, 211 insertions(+), 189 deletions(-)

diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 59f9f3aa4..27fda8bac 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -539,4 +539,119 @@ TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
             ref_matrix, nonuniform_ref, n, params );
 }
 
+// -----------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TriangularMatrix
+inline void mark_params_for_test_TriangularMatrix(Params& params)
+{
+    params.grid.m();
+    params.grid.n();
+    params.dev_dist();
+    params.uplo();
+    params.diag();
+    params.nb();
+    params.nonuniform_nb();
+    params.origin();
+    params.grid_order();
+}
+
+// -----------------------------------------------------------------------------
+/// Allocates a TriangularMatrix and a reference version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    slate::Dist dev_dist = params.dev_dist();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+
+    // The object to be returned
+    TestMatrix<slate::TriangularMatrix<scalar_t>> matrix( n, n, nb, p, q, grid_order );
+
+    // Functions for nonuniform tile sizes or row device distribution
+    std::function< int64_t (int64_t j) > tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        tileNb = slate::func::uniform_blocksize( n, nb );
+    }
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
+                                                   p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        if (nonuniform_nb || dev_dist == slate::Dist::Col) {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+        }
+        else {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD);
+        }
+
+        // SLATE allocates CPU or GPU tiles.
+ slate::Target origin_target = origin2target(origin); + matrix.A.insertLocalTiles(origin_target); + } + else { + assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb && nonuniform_ref) { + matrix.Aref = slate::TriangularMatrix( + uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + } + + return matrix; +} + + #endif // SLATE_MATRIX_UTILS_HH diff --git a/test/run_tests.py b/test/run_tests.py index 584b96bf7..1de26268b 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -378,11 +378,11 @@ def filter_csv( values, csv ): # todo: tbsm fails for nb=8 or 16 with --quick. [ 'tbsm', gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ], - [ 'trmm', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], + [ 'trmm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], - [ 'trsm', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], - [ 'trsmA', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], - [ 'trsmB', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ], + [ 'trsm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trsmA', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trsmB', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], ] # LU diff --git a/test/test_trmm.cc b/test/test_trmm.cc index 26e0b3b64..941ee2ecb 100644 --- a/test/test_trmm.cc +++ b/test/test_trmm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "blas/flops.hh" +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,9 +40,6 @@ void test_trmm_work(Params& params, bool run) int64_t nrhs = params.nrhs(); scalar_t alpha = params.alpha.get(); slate::Op transB = params.transB(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -51,6 +50,9 @@ void test_trmm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -65,6 +67,11 @@ void test_trmm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,60 +86,24 @@ void test_trmm_work(Params& params, bool run) int64_t Bm = (transB == slate::Op::NoTrans ? 
m : n); int64_t Bn = (transB == slate::Op::NoTrans ? n : m); - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - } - - slate::TriangularMatrix A; - slate::Matrix B; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::TriangularMatrix(uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_TriangularMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( ref, true, Bm, Bn, params ); - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; generate_matrix( params.matrix, A ); generate_matrix( params.matrixB, B ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - std::vector Bref_data; - if (ref) { - Bref_data.resize( lldB * nlocB ); - auto Bref = slate::Matrix::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - slate::copy( B, Bref ); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_orig_norm=0; + if (ref) { + slate::copy( B, Bref ); + + A_norm = slate::norm(norm, A); + B_orig_norm = slate::norm(norm, B); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -148,14 +119,16 @@ void test_trmm_work(Params& params, bool run) B = conj_transpose( B ); // If check run, perform first half of SLATE residual check. - slate::Matrix X, X2, Y; + TestMatrix> X_alloc, X2_alloc, Y_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - X2 = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X2.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + X2_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + + auto& X = X_alloc.A; + auto& X2 = X2_alloc.A; + auto& Y = Y_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); generate_matrix( mp, X ); @@ -207,6 +180,9 @@ void test_trmm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, B*X - Y. 
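        // Y was formed from the original A and B before the run, so a correct
        // updated B must satisfy B X = Y up to roundoff.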
real_t y_norm = slate::norm( norm, Y, opts ); @@ -224,48 +200,27 @@ void test_trmm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - if (origin != slate::Origin::ScaLAPACK) { - // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - } + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); - // allocate workspace for norms - std::vector worklantr(std::max(mlocA, nlocA)); - std::vector worklange(std::max(mlocB, nlocB)); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; - // get norms of the original data - real_t A_norm = scalapack_plantr( - norm2str(norm), uplo2str(uplo), diag2str(diag), Am, An, &A_data[0], - 1, 1, A_desc, &worklantr[0]); - real_t B_orig_norm = scalapack_plange( - norm2str(norm), Bm, Bn, &Bref_data[0], 1, 1, B_desc, &worklange[0]); + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + } //================================================== // Run ScaLAPACK reference routine. 
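The resize-then-copy block above now recurs in every tester that compares against ScaLAPACK. A small helper could fold it away; here is a sketch under the assumption that TestMatrix exposes A, A_data, lld, and nloc as used throughout these patches (the helper itself is hypothetical, not part of the patch series):

    // Hypothetical helper: gather the SLATE matrix into the TestMatrix's
    // ScaLAPACK-layout buffer before a reference run.
    template <typename MatrixType>
    void copy_to_scalapack_layout( TestMatrix<MatrixType>& alloc, blas_int* desc )
    {
        alloc.A_data.resize( alloc.lld * alloc.nloc );
        copy( alloc.A, &alloc.A_data[0], desc );
    }

Call sites would then reduce to copy_to_scalapack_layout( A_alloc, A_desc ) and the like, keeping the origin != slate::Origin::ScaLAPACK guard at the caller, since ScaLAPACK-origin runs already share the buffer.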
@@ -277,12 +232,11 @@ void test_trmm_work(Params& params, bool run) &Bref_data[0], 1, 1, Bref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Bref_data - B_data - blas::axpy(Bref_data.size(), -1.0, &B_data[0], 1, &Bref_data[0], 1); + // get differences B = B - Bref + slate::add(-one, Bref, one, B); - // norm(Bref_data - B_data) - real_t B_diff_norm = scalapack_plange(norm2str(norm), Bm, Bn, &Bref_data[0], - 1, 1, Bref_desc, &worklange[0]); + // norm(B - Bref) + real_t B_diff_norm = slate::norm(norm, B); real_t error = B_diff_norm / (sqrt(real_t(Am) + 2) * std::abs(alpha) * A_norm * B_orig_norm); @@ -298,7 +252,7 @@ void test_trmm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_trsm.cc b/test/test_trsm.cc index 24e3d3360..5f4bd7045 100644 --- a/test/test_trsm.cc +++ b/test/test_trsm.cc @@ -6,12 +6,14 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "print_matrix.hh" -#include "grid_utils.hh" #include #include @@ -44,9 +46,6 @@ void test_trsm_work(Params& params, bool run) int64_t m = params.dim.m(); int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -58,6 +57,9 @@ void test_trsm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +74,11 @@ void test_trsm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -87,49 +94,12 @@ void test_trsm_work(Params& params, bool run) int64_t Bm = m; int64_t Bn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - } + auto A_alloc = allocate_test_TriangularMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( check || ref, true, Bm, Bn, params ); - slate::TriangularMatrix A; - slate::Matrix B; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. 
- slate::Target origin_target = origin2target(origin); - A = slate::TriangularMatrix( - uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix( - Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - - B_data.resize( lldB * nlocB ); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); @@ -140,13 +110,8 @@ void test_trsm_work(Params& params, bool run) auto AH = slate::HermitianMatrix( A ); slate::potrf( AH, opts ); - // if check is required, copy test data - std::vector< scalar_t > Bref_data; - slate::Matrix Bref; + // If reference run is required, record norms to be used in the check/ref. if (check || ref) { - Bref_data.resize( lldB * nlocB ); - Bref = slate::Matrix::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); slate::copy( B, Bref ); } @@ -220,34 +185,22 @@ void test_trsm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK for timing only - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); + + auto& A_data = A_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + } //================================================== // Run ScaLAPACK reference routine. 
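Taken together, these changes converge the level-3 BLAS testers on a single allocation pattern. A condensed sketch of that skeleton, assembled from the calls visible in the diffs above; test_example_work is hypothetical, and the ref/dim reads are simplified assumptions about the Params interface rather than verbatim tester code:

    // Hypothetical tester skeleton; a sketch, not a verbatim file.
    template <typename scalar_t>
    void test_example_work( Params& params, bool run )
    {
        // Declare which grid/tile parameters this test consumes.
        mark_params_for_test_TriangularMatrix( params );
        mark_params_for_test_Matrix( params );

        bool ref = params.ref() == 'y';   // simplified read, assumed interface
        int64_t An = params.dim.n();
        int64_t Bm = params.dim.m();
        int64_t Bn = params.dim.n();

        if (! run)
            return;
        // Reject combinations such as nonuniform_nb with a ScaLAPACK origin.
        if (is_invalid_parameters( params ))
            return;

        // B gets a reference copy only when a reference run is requested.
        auto A_alloc = allocate_test_TriangularMatrix<scalar_t>( false, true, An, params );
        auto B_alloc = allocate_test_Matrix<scalar_t>( ref, true, Bm, Bn, params );
        auto& A    = A_alloc.A;
        auto& B    = B_alloc.A;
        auto& Bref = B_alloc.Aref;

        slate::generate_matrix( params.matrix, A );
        slate::generate_matrix( params.matrixB, B );
        if (ref)
            slate::copy( B, Bref );

        // ... run the routine under test, then compare B against Bref ...
    }

When both nonuniform_nb and nonuniform_ref are set, Aref shares A's non-uniform tile layout; otherwise Aref is backed by the ScaLAPACK-layout buffer Aref_data, which is what the reference comparisons above rely on.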
From 782c4821750916d979844ffbe24b833973e0f8b6 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 13:19:07 -0500 Subject: [PATCH 14/33] Move allocate_test_* functions to cc file --- GNUmakefile | 1 + test/matrix_utils.cc | 475 +++++++++++++++++++++++++++++++++++++++++++ test/matrix_utils.hh | 365 ++------------------------------- 3 files changed, 494 insertions(+), 347 deletions(-) create mode 100644 test/matrix_utils.cc diff --git a/GNUmakefile b/GNUmakefile index 604394716..b71c10c0c 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -702,6 +702,7 @@ endif tester_src += \ test/matrix_generator.cc \ test/matrix_params.cc \ + test/matrix_utils.cc \ test/random.cc \ test/test.cc \ test/test_add.cc \ diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc new file mode 100644 index 000000000..b157e97ee --- /dev/null +++ b/test/matrix_utils.cc @@ -0,0 +1,475 @@ +// Copyright (c) 2017-2023, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "matrix_utils.hh" + +//------------------------------------------------------------------------------ +/// Allocates a Matrix and optionally a reference version for testing. +/// +/// @param[in] ref_matrix +/// Whether to allocate a reference matrix +/// +/// @param[in] nonuniform_ref +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param[in] m +/// The number of rows +/// +/// @param[in] n +/// The number of columns +/// +/// @param[in] params +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params) +{ + // Load params variables + int p = params.grid.m(); + int q = params.grid.n(); + slate::Dist dev_dist = params.dev_dist(); + int64_t nb = params.nb(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + slate::Origin origin = params.origin(); + slate::GridOrder grid_order = params.grid_order(); + + // The object to be returned + TestMatrix> matrix( m, n, nb, p, q, grid_order ); + + // Functions for nonuniform tile sizes or row device distribution + std::function< int64_t (int64_t j) > tileMb, tileNb; + if (nonuniform_nb) { + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? nb*2 : nb); + }; + tileMb = tileNb; + } + else { + tileMb = slate::func::uniform_blocksize( m, nb ); + tileNb = slate::func::uniform_blocksize( n, nb ); + } + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); + int num_devices_ = blas::get_device_count(); + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), + p, num_devices_ ); + + // Setup matrix to test SLATE with + if (origin != slate::Origin::ScaLAPACK) { + // SLATE allocates CPU or GPU tiles. 
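+        // origin2target maps Origin::Host / Origin::Devices to the matching
+        // Target, so the tiles inserted below land in CPU memory or on the GPUs.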
+ slate::Target origin_target = origin2target( origin ); + if (nonuniform_nb || dev_dist == slate::Dist::Col) { + matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD); + } + else { + matrix.A = slate::Matrix( m, n, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + matrix.A.insertLocalTiles( origin_target ); + } + else { + assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::Matrix::fromScaLAPACK( + m, n, &matrix.A_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb && nonuniform_ref) { + matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD ); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::Matrix::fromScaLAPACK( + m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + } + + return matrix; +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_Matrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_Matrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params); + +//------------------------------------------------------------------------------ +/// Helper routine to avoid duplicating logic between HermitianMatrix +/// and SymmetricMatrix +/// +template +TestMatrix allocate_test_HeSyMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params) +{ + // Load params variables + slate::Uplo uplo = params.uplo(); + int p = params.grid.m(); + int q = params.grid.n(); + slate::Dist dev_dist = params.dev_dist(); + int64_t nb = params.nb(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + slate::Origin origin = params.origin(); + slate::GridOrder grid_order = params.grid_order(); + + // The object to be returned + TestMatrix matrix ( n, n, nb, p, q, grid_order ); + + // Functions for nonuniform tile sizes or row device distribution + std::function< int64_t (int64_t j) > tileNb; + if (nonuniform_nb) { + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? nb*2 : nb); + }; + } + else { + tileNb = slate::func::uniform_blocksize( n, nb ); + } + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); + int num_devices_ = blas::get_device_count(); + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), + p, num_devices_ ); + + // Setup matrix to test SLATE with + if (origin != slate::Origin::ScaLAPACK) { + if (nonuniform_nb || dev_dist == slate::Dist::Col) { + matrix.A = matrixtype( + uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); + } + else { + matrix.A = matrixtype( + uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD); + } + + // SLATE allocates CPU or GPU tiles. 
+        slate::Target origin_target = origin2target(origin);
+        matrix.A.insertLocalTiles(origin_target);
+    }
+    else {
+        assert( !nonuniform_nb );
+        assert( dev_dist == slate::Dist::Row );
+        // Create SLATE matrix from the ScaLAPACK layouts
+        matrix.A_data.resize( matrix.lld * matrix.nloc );
+        matrix.A = matrixtype::fromScaLAPACK(
+                uplo, n, &matrix.A_data[0], matrix.lld, nb,
+                grid_order, p, q, MPI_COMM_WORLD);
+    }
+
+    // Setup reference matrix
+    if (ref_matrix) {
+        if (nonuniform_nb && nonuniform_ref) {
+            matrix.Aref = matrixtype(
+                    uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+            matrix.Aref.insertLocalTiles( slate::Target::Host );
+        }
+        else {
+            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
+            matrix.Aref = matrixtype::fromScaLAPACK(
+                    uplo, n, &matrix.Aref_data[0], matrix.lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD);
+        }
+    }
+
+    return matrix;
+}
+
+//------------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::HermitianMatrix<scalar_t>>(
+            ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::HermitianMatrix<float>> allocate_test_HermitianMatrix<float>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<double>> allocate_test_HermitianMatrix<double>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<float>>> allocate_test_HermitianMatrix<std::complex<float>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<double>>> allocate_test_HermitianMatrix<std::complex<double>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a SymmetricMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::SymmetricMatrix<scalar_t>>(
+            ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::SymmetricMatrix<float>> allocate_test_SymmetricMatrix<float>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<double>> allocate_test_SymmetricMatrix<double>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<float>>> allocate_test_SymmetricMatrix<std::complex<float>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<double>>> allocate_test_SymmetricMatrix<std::complex<double>>(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a TriangularMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of rows and columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix,
+    bool nonuniform_ref,
+    int64_t n,
+    Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+    int p = params.grid.m();
+    int q = params.grid.n();
+    slate::Dist dev_dist = params.dev_dist();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+
+    // The object to be returned
+    TestMatrix<slate::TriangularMatrix<scalar_t>> matrix( n, n, nb, p, q, grid_order );
+
+    // Functions for nonuniform tile sizes or row device distribution
+    std::function< int64_t (int64_t j) > tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        tileNb = slate::func::uniform_blocksize( n, nb );
+    }
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
+                                                   p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        if (nonuniform_nb || dev_dist == slate::Dist::Col) {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD);
+        }
+        else {
+            matrix.A = slate::TriangularMatrix<scalar_t>(
+                    uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD);
+        }
+
+        // SLATE allocates CPU or GPU tiles.
+ slate::Target origin_target = origin2target(origin); + matrix.A.insertLocalTiles(origin_target); + } + else { + assert( !nonuniform_nb ); + assert( dev_dist == slate::Dist::Row ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb && nonuniform_ref) { + matrix.Aref = slate::TriangularMatrix( + uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, + grid_order, p, q, MPI_COMM_WORLD); + } + } + + return matrix; +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +TestMatrix> allocate_test_TriangularMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); + +template +TestMatrix> allocate_test_TriangularMatrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_TriangularMatrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); + +template +TestMatrix>> allocate_test_TriangularMatrix>( + bool ref_matrix, + bool nonuniform_ref, + int64_t n, + Params& params); diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 27fda8bac..b3db03c62 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -7,6 +7,7 @@ #define SLATE_MATRIX_UTILS_HH #include "slate/slate.hh" +#include "test.hh" #include "scalapack_wrappers.hh" #include "grid_utils.hh" @@ -272,386 +273,56 @@ inline void mark_params_for_test_Matrix(Params& params) } //------------------------------------------------------------------------------ -/// Allocates a Matrix and optionally a reference version for testing. -/// -/// @param[in] ref_matrix -/// Whether to allocate a reference matrix -/// -/// @param[in] nonuniform_ref -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param[in] m -/// The number of rows -/// -/// @param[in] n -/// The number of columns -/// -/// @param[in] params -/// The test params object which contains many of the key parameters -/// -template -TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params) -{ - // Load params variables - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix> matrix( m, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes. - // Odd-numbered tiles are 2*nb, even-numbered tiles are nb. - std::function< int64_t (int64_t j) > tileMb, tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? 
nb*2 : nb); - }; - tileMb = tileNb; - } - else { - tileMb = slate::func::uniform_blocksize( m, nb ); - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - if (nonuniform_nb) { - params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) ) - + ", " + std::to_string( tileNb( 1 ) ); - } - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = slate::Matrix( m, n, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - matrix.A.insertLocalTiles( origin_target ); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::Matrix::fromScaLAPACK( - m, n, &matrix.A_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD ); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::Matrix::fromScaLAPACK( - m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - } - - return matrix; -} - -// ----------------------------------------------------------------------------- /// Marks the paramters used by allocate_test_HermitianMatrix inline void mark_params_for_test_HermitianMatrix(Params& params) { - params.grid.m(); - params.grid.n(); - params.dev_dist(); params.uplo(); - params.nb(); - params.nonuniform_nb(); - params.origin(); - params.grid_order(); + mark_params_for_test_Matrix( params ); } -// ----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Marks the paramters used by allocate_test_SymmetricMatrix inline void mark_params_for_test_SymmetricMatrix(Params& params) { mark_params_for_test_HermitianMatrix( params ); } +//------------------------------------------------------------------------------ +/// Marks the paramters used by allocate_test_HermitianMatrix +inline void mark_params_for_test_TriangularMatrix(Params& params) +{ + params.uplo(); + params.diag(); + mark_params_for_test_Matrix( params ); +} -// ----------------------------------------------------------------------------- -/// Helper routine to avoid duplicating logic between HermitianMatrix and SymmetricMatrix -/// -template -TestMatrix allocate_test_HeSyMatrix( +template +TestMatrix> allocate_test_Matrix( bool ref_matrix, bool nonuniform_ref, + int64_t m, int64_t n, - Params& params) -{ - // Load params variables - slate::Uplo uplo = params.uplo(); - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix 
matrix ( n, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = matrixtype( - uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } + Params& params); - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = matrixtype::fromScaLAPACK( - uplo, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = matrixtype::fromScaLAPACK( - uplo, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } - - return matrix; -} - -// ----------------------------------------------------------------------------- -/// Allocates a HermitianMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param m[in] -/// The number of rows -/// -/// @param n[in] -/// The number of columns -/// -/// @param params[in] -/// The test params object which contains many of the key parameters -/// template TestMatrix> allocate_test_HermitianMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, - Params& params) -{ - return allocate_test_HeSyMatrix>( - ref_matrix, nonuniform_ref, n, params ); -} + Params& params); -// ----------------------------------------------------------------------------- -/// Allocates a SymmetricMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. 
-/// -/// @param m[in] -/// The number of rows -/// -/// @param n[in] -/// The number of columns -/// -/// @param params[in] -/// The test params object which contains many of the key parameters -/// template TestMatrix> allocate_test_SymmetricMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, - Params& params) -{ - return allocate_test_HeSyMatrix>( - ref_matrix, nonuniform_ref, n, params ); -} + Params& params); -// ----------------------------------------------------------------------------- -/// Marks the paramters used by allocate_test_HermitianMatrix -inline void mark_params_for_test_TriangularMatrix(Params& params) -{ - params.grid.m(); - params.grid.n(); - params.dev_dist(); - params.uplo(); - params.diag(); - params.nb(); - params.nonuniform_nb(); - params.origin(); - params.grid_order(); -} - -// ----------------------------------------------------------------------------- -/// Allocates a SymmetricMatrix and a reference version for testing. -/// -/// @param ref_matrix[in] -/// Whether to allocate a reference matrix -/// -/// @param nonuniform_ref[in] -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param m[in] -/// The number of rows -/// -/// @param n[in] -/// The number of columns -/// -/// @param params[in] -/// The test params object which contains many of the key parameters -/// template TestMatrix> allocate_test_TriangularMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, - Params& params) -{ - // Load params variables - slate::Uplo uplo = params.uplo(); - slate::Diag diag = params.diag(); - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix> matrix ( n, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. 
- slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } - - return matrix; -} - + Params& params); #endif // SLATE_MATRIX_UTILS_HH From cca1384c07cf8d4f4bbcce63d7f51ac39b9a4b46 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 14:08:04 -0500 Subject: [PATCH 15/33] Deduplicate code in matrix allocate --- test/matrix_utils.cc | 290 +++++++++++++++++-------------------------- 1 file changed, 116 insertions(+), 174 deletions(-) diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc index b157e97ee..2e126149c 100644 --- a/test/matrix_utils.cc +++ b/test/matrix_utils.cc @@ -5,32 +5,22 @@ #include "matrix_utils.hh" +using nb_func_t = std::function< int64_t(int64_t) >; +using dist_func_t = std::function< int(std::tuple) >; + //------------------------------------------------------------------------------ -/// Allocates a Matrix and optionally a reference version for testing. -/// -/// @param[in] ref_matrix -/// Whether to allocate a reference matrix -/// -/// @param[in] nonuniform_ref -/// If params.nonuniform_nb(), whether to also allocate the reference matrix -/// with non-uniform tiles. -/// -/// @param[in] m -/// The number of rows -/// -/// @param[in] n -/// The number of columns -/// -/// @param[in] params -/// The test params object which contains many of the key parameters -/// -template -TestMatrix> allocate_test_Matrix( +/// Shared logic of the allocate_test_* routines +template +static TestMatrix allocate_test_shared( bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, - Params& params) + Params& params, + irregular_constructor_t construct_irregular, + regular_constructor_t construct_regular, + scalapack_constructor_t construct_scalapack) { // Load params variables int p = params.grid.m(); @@ -42,10 +32,10 @@ TestMatrix> allocate_test_Matrix( slate::GridOrder grid_order = params.grid_order(); // The object to be returned - TestMatrix> matrix( m, n, nb, p, q, grid_order ); + TestMatrix matrix( m, n, nb, p, q, grid_order ); // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileMb, tileNb; + nb_func_t tileMb, tileNb; if (nonuniform_nb) { tileNb = [nb](int64_t j) { // for non-uniform tile size @@ -54,8 +44,8 @@ TestMatrix> allocate_test_Matrix( tileMb = tileNb; } else { - tileMb = slate::func::uniform_blocksize( m, nb ); tileNb = slate::func::uniform_blocksize( n, nb ); + tileMb = (m == n) ? tileNb : slate::func::uniform_blocksize( m, nb ); } auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); int num_devices_ = blas::get_device_count(); @@ -67,12 +57,10 @@ TestMatrix> allocate_test_Matrix( // SLATE allocates CPU or GPU tiles. 
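        // Dispatch on the caller-supplied constructors: construct_irregular
        // whenever the layout cannot be described by a uniform 2D block
        // cycle (nonuniform nb, or a column device distribution), else
        // construct_regular; the ScaLAPACK branch below instead wraps
        // caller-owned memory via construct_scalapack.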
slate::Target origin_target = origin2target( origin ); if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::Matrix( m, n, tileMb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); + matrix.A = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); } else { - matrix.A = slate::Matrix( m, n, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); + matrix.A = construct_regular( nb, grid_order, p, q ); } matrix.A.insertLocalTiles( origin_target ); } @@ -81,30 +69,75 @@ TestMatrix> allocate_test_Matrix( assert( dev_dist == slate::Dist::Row ); // Create SLATE matrix from the ScaLAPACK layouts matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::Matrix::fromScaLAPACK( - m, n, &matrix.A_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); + matrix.A = construct_scalapack( &matrix.A_data[0], matrix.lld, nb, + grid_order, p, q ); } // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD ); + matrix.Aref = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); matrix.Aref.insertLocalTiles( slate::Target::Host ); } else { matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::Matrix::fromScaLAPACK( - m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); + matrix.Aref = construct_scalapack( &matrix.Aref_data[0], matrix.lld, nb, + grid_order, p, q ); } } return matrix; } -//------------------------------------------------------------------------------ +// ----------------------------------------------------------------------------- +/// Allocates a Matrix and optionally a reference version for testing. +/// +/// @param[in] ref_matrix +/// Whether to allocate a reference matrix +/// +/// @param[in] nonuniform_ref +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param[in] m +/// The number of rows +/// +/// @param[in] n +/// The number of columns +/// +/// @param[in] params +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + bool nonuniform_ref, + int64_t m, + int64_t n, + Params& params) +{ + auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return slate::Matrix( m, n, tileMb, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return slate::Matrix( m, n, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return slate::Matrix::fromScaLAPACK( + m, n, data, lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared>( + ref_matrix, nonuniform_ref, m, n, params, + construct_irregular, construct_regular, construct_scalapack ); +} // Explicit instantiations. 
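The three-lambda shape used here recurs for every matrix kind below: the shared routine owns the branching (origin, nonuniform nb, device distribution), while each wrapper only says how to build its particular type. A minimal, self-contained sketch of that idea, using a made-up FakeMatrix stand-in rather than SLATE types:

    // callback_alloc_demo.cc -- the deduplication pattern in miniature.
    // FakeMatrix and the labels are hypothetical, not SLATE code.
    #include <functional>
    #include <iostream>
    #include <string>

    struct FakeMatrix { std::string how; };

    template <typename matrix_t>
    matrix_t allocate_shared(
        bool nonuniform_nb,
        std::function<matrix_t()> construct_irregular,
        std::function<matrix_t()> construct_regular)
    {
        // The shared routine decides *which* constructor runs;
        // each caller decides *how* its matrix type is built.
        return nonuniform_nb ? construct_irregular() : construct_regular();
    }

    int main()
    {
        auto m = allocate_shared<FakeMatrix>(
            true,
            [] { return FakeMatrix{ "irregular (tile-size lambdas)" }; },
            [] { return FakeMatrix{ "regular (uniform nb)" }; } );
        std::cout << m.how << "\n";  // prints: irregular (tile-size lambdas)
        return 0;
    }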
template TestMatrix> allocate_test_Matrix( @@ -142,83 +175,37 @@ TestMatrix>> allocate_test_Matrix -TestMatrix allocate_test_HeSyMatrix( +template +TestMatrix allocate_test_HeSyMatrix( bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { - // Load params variables - slate::Uplo uplo = params.uplo(); - int p = params.grid.m(); - int q = params.grid.n(); - slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix matrix ( n, n, nb, p, q, grid_order ); - - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = matrixtype( - uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = matrixtype::fromScaLAPACK( - uplo, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } + using scalar_t = typename matrix_type::value_type; - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.A = matrixtype( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = matrixtype::fromScaLAPACK( - uplo, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } + slate::Uplo uplo = params.uplo(); - return matrix; + auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return matrix_type( uplo, n, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return matrix_type( uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return matrix_type::fromScaLAPACK( uplo, n, data, lld, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared( + ref_matrix, nonuniform_ref, n, n, params, + construct_irregular, construct_regular, construct_scalapack ); } //------------------------------------------------------------------------------ @@ -374,74 +361,29 @@ TestMatrix> allocate_test_TriangularMatrix( // Load params variables slate::Uplo uplo = params.uplo(); slate::Diag diag = params.diag(); - int p = params.grid.m(); - int q = params.grid.n(); - 
slate::Dist dev_dist = params.dev_dist(); - int64_t nb = params.nb(); - bool nonuniform_nb = params.nonuniform_nb() == 'y'; - slate::Origin origin = params.origin(); - slate::GridOrder grid_order = params.grid_order(); - - // The object to be returned - TestMatrix> matrix ( n, n, nb, p, q, grid_order ); - // Functions for nonuniform tile sizes or row device distribution - std::function< int64_t (int64_t j) > tileNb; - if (nonuniform_nb) { - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb*2 : nb); - }; - } - else { - tileNb = slate::func::uniform_blocksize( n, nb ); - } - auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), - p, num_devices_ ); - - // Setup matrix to test SLATE with - if (origin != slate::Origin::ScaLAPACK) { - if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - matrix.A = slate::TriangularMatrix( - uplo, diag, n, nb, grid_order, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - matrix.A.insertLocalTiles(origin_target); - } - else { - assert( !nonuniform_nb ); - assert( dev_dist == slate::Dist::Row ); - // Create SLATE matrix from the ScaLAPACK layouts - matrix.A_data.resize( matrix.lld * matrix.nloc ); - matrix.A = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.A_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - - // Setup reference matrix - if (ref_matrix) { - if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = slate::TriangularMatrix( - uplo, diag, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - matrix.Aref.insertLocalTiles( slate::Target::Host ); - } - else { - matrix.Aref_data.resize( matrix.lld * matrix.nloc ); - matrix.Aref = slate::TriangularMatrix::fromScaLAPACK( - uplo, diag, n, &matrix.Aref_data[0], matrix.lld, nb, - grid_order, p, q, MPI_COMM_WORLD); - } - } - - return matrix; + auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return slate::TriangularMatrix( uplo, diag, n, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return slate::TriangularMatrix( uplo, diag, n, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return slate::TriangularMatrix::fromScaLAPACK( + uplo, diag, n, data, lld, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared>( + ref_matrix, nonuniform_ref, n, n, params, + construct_irregular, construct_regular, construct_scalapack ); } //------------------------------------------------------------------------------ From 3783b19e316e7991644ee8f8b1cdf01dac264973 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 14:13:01 -0500 Subject: [PATCH 16/33] Tighten some function signatures --- test/matrix_utils.cc | 122 +++++++++---------------------------------- test/matrix_utils.hh | 21 ++------ 2 files changed, 29 insertions(+), 114 deletions(-) diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc index 2e126149c..df9c1022a 100644 --- a/test/matrix_utils.cc +++ b/test/matrix_utils.cc @@ -13,14 +13,10 @@ using 
dist_func_t = std::function< int(std::tuple) >; template static TestMatrix allocate_test_shared( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params, - irregular_constructor_t construct_irregular, - regular_constructor_t construct_regular, - scalapack_constructor_t construct_scalapack) + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params, + irregular_constructor_t construct_irregular, + regular_constructor_t construct_regular, + scalapack_constructor_t construct_scalapack) { // Load params variables int p = params.grid.m(); @@ -110,11 +106,7 @@ static TestMatrix allocate_test_shared( /// template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params) { auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) @@ -141,35 +133,19 @@ TestMatrix> allocate_test_Matrix( // Explicit instantiations. template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix>> allocate_test_Matrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix>> allocate_test_Matrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); //------------------------------------------------------------------------------ /// Helper routine to avoid duplicating logic between HermitianMatrix @@ -177,10 +153,7 @@ TestMatrix>> allocate_test_Matrix TestMatrix allocate_test_HeSyMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { using scalar_t = typename matrix_type::value_type; @@ -230,10 +203,7 @@ TestMatrix allocate_test_HeSyMatrix( /// template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { return allocate_test_HeSyMatrix>( ref_matrix, nonuniform_ref, n, params ); @@ -243,31 +213,19 @@ TestMatrix> allocate_test_HermitianMatrix( // Explicit instantiations. 
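A note on the "Explicit instantiations." blocks these hunks keep reshaping: since the allocator templates are defined in matrix_utils.cc rather than the header, every scalar type a tester may request must be instantiated there by hand, or the tester fails at link time. A small sketch of the mechanism with a hypothetical scale_by_nb function, split across two translation units the same way as matrix_utils.hh/.cc:

    // scale_demo.hh -- declaration only; the definition lives in the .cc.
    #include <cstdint>
    template <typename scalar_t>
    scalar_t scale_by_nb( scalar_t x, int64_t nb );

    // scale_demo.cc -- definition plus one explicit instantiation per
    // scalar type callers are allowed to use; any other type fails at
    // link time rather than compile time.
    #include "scale_demo.hh"
    template <typename scalar_t>
    scalar_t scale_by_nb( scalar_t x, int64_t nb )
    {
        return x * scalar_t( nb );
    }

    template float  scale_by_nb<float> ( float,  int64_t );
    template double scale_by_nb<double>( double, int64_t );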
template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_HermitianMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_HermitianMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); //------------------------------------------------------------------------------ /// Allocates a SymmetricMatrix and optionally a reference @@ -291,10 +249,7 @@ TestMatrix>> allocate_test_Hermitian /// template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { return allocate_test_HeSyMatrix>( ref_matrix, nonuniform_ref, n, params ); @@ -304,31 +259,19 @@ TestMatrix> allocate_test_SymmetricMatrix( // Explicit instantiations. template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_SymmetricMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_SymmetricMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); //------------------------------------------------------------------------------ @@ -353,10 +296,7 @@ TestMatrix>> allocate_test_Symmetric /// template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params) + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params) { // Load params variables slate::Uplo uplo = params.uplo(); @@ -390,28 +330,16 @@ TestMatrix> allocate_test_TriangularMatrix( // Explicit instantiations. 
template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_TriangularMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix>> allocate_test_TriangularMatrix>( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index b3db03c62..f92576c84 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -298,31 +298,18 @@ inline void mark_params_for_test_TriangularMatrix(Params& params) template TestMatrix> allocate_test_Matrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t m, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params); template TestMatrix> allocate_test_HermitianMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_SymmetricMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); template TestMatrix> allocate_test_TriangularMatrix( - bool ref_matrix, - bool nonuniform_ref, - int64_t n, - Params& params); + bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); #endif // SLATE_MATRIX_UTILS_HH From 88b497e3e86417ef6d1d85868dffe07159b79039 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 19 Dec 2023 14:27:38 -0500 Subject: [PATCH 17/33] Refactor some test/run_tests.py parameters into variables --- test/run_tests.py | 137 ++++++++++++++++++------------------ test/test_svd.cc | 173 +++++++++++++--------------------------------- 2 files changed, 119 insertions(+), 191 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index 1de26268b..dcf5d7324 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -316,6 +316,11 @@ gen_no_nb = origin + target + grid + check + ref + tol + repeat gen_no_target = grid + check + ref + tol + repeat + nb +ge_matrix = ddist + grid_order +sy_matrix = uplo + ddist + grid_order +he_matrix = uplo + ddist + grid_order +tr_matrix = uplo + diag + ddist + grid_order + if (opts.matrix): gen += matrix @@ -350,67 +355,67 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ddist + grid_order ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ], - [ 'hemm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + 
matrixBC ], + [ 'hemm', gen + dtype + la + side + he_matrix + mn + ab + matrixBC ], # todo: hemmA GPU support - [ 'hemmA', gen_no_target + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], - [ 'hemmC', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC], + [ 'hemmA', gen_no_target + dtype + la + side + he_matrix + mn + ab + matrixBC ], + [ 'hemmC', gen + dtype + la + side + he_matrix + mn + ab + matrixBC], [ 'hbmm', gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ], - [ 'herk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], - [ 'herk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixC ], + [ 'herk', gen + dtype_real + la + he_matrix + trans + mn + ab + matrixC ], + [ 'herk', gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixC ], - [ 'her2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], - [ 'her2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nc + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_real + la + he_matrix + trans + mn + ab + matrixBC ], + [ 'her2k', gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixBC ], - [ 'symm', gen + dtype + la + side + uplo + ddist + grid_order + mn + ab + matrixBC ], + [ 'symm', gen + dtype + la + side + sy_matrix + mn + ab + matrixBC ], - [ 'syr2k', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixC ], - [ 'syr2k', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_real + la + sy_matrix + trans + mn + ab + matrixC ], + [ 'syr2k', gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixC ], - [ 'syrk', gen + dtype_real + la + uplo + ddist + grid_order + trans + mn + ab + matrixBC ], - [ 'syrk', gen + dtype_complex + la + uplo + ddist + grid_order + trans_nt + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_real + la + sy_matrix + trans + mn + ab + matrixBC ], + [ 'syrk', gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixBC ], # todo: tbsm fails for nb=8 or 16 with --quick. 
[ 'tbsm', gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ], - [ 'trmm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trmm', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], - [ 'trsm', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], - [ 'trsmA', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], - [ 'trsmB', gen + dtype + la + side + uplo + ddist + grid_order + nonuniform_nb + transA + diag + mn + a + matrixB ], + [ 'trsm', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], + [ 'trsmA', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], + [ 'trsmB', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ], ] # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], - [ 'gesv_tntpiv', gen + dtype + la + n + ddist + grid_order ], - [ 'gesv_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + [ 'gesv', gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ], + [ 'gesv_tntpiv', gen + dtype + la + n + ge_matrix ], + [ 'gesv_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + thresh ], - [ 'getrf_tntpiv', gen + dtype + la + n + ddist + grid_order ], - [ 'getrf_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + [ 'getrf', gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ], + [ 'getrf_tntpiv', gen + dtype + la + n + ge_matrix ], + [ 'getrf_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + ddist + grid_order + nonuniform_nb + thresh ], - [ 'getrs_tntpiv', gen + dtype + la + n + ddist + grid_order ], - [ 'getrs_nopiv', gen + dtype + la + n + ddist + grid_order + nonuniform_nb + [ 'getrs', gen + dtype + la + n + trans + ge_matrix + nonuniform_nb + thresh ], + [ 'getrs_tntpiv', gen + dtype + la + n + ge_matrix ], + [ 'getrs_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getri', gen + dtype + la + n + ddist + grid_order ], - [ 'getriOOP', gen + dtype + la + n + ddist + grid_order ], + [ 'getri', gen + dtype + la + n + ge_matrix ], + [ 'getriOOP', gen + dtype + la + n + ge_matrix ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n + ddist + grid_order + nonuniform_nb ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ddist + grid_order + nonuniform_nb ], - [ 'gesv_rbt', gen + dtype + la + n + ddist + grid_order ], + [ 'gesv_mixed', gen + dtype_double + la + n + ge_matrix + nonuniform_nb ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ge_matrix + nonuniform_nb ], + [ 'gesv_rbt', gen + dtype + la + n + ge_matrix ], ] # LU banded @@ -426,14 +431,14 @@ def filter_csv( values, csv ): # Cholesky if (opts.chol): cmds += [ - [ 'posv', gen + dtype + la + n + uplo + ddist + grid_order ], - [ 'potrf', gen + dtype + la + n + uplo + ddist + grid_order ], - [ 'potrs', gen + dtype + la + n + uplo + ddist + grid_order ], - [ 'potri', gen + dtype + la + n + uplo + ddist + grid_order ], + [ 'posv', gen + dtype + la + n + 
he_matrix ], + [ 'potrf', gen + dtype + la + n + he_matrix ], + [ 'potrs', gen + dtype + la + n + he_matrix ], + [ 'potri', gen + dtype + la + n ], #[ 'porfs', gen + dtype + la + n + uplo ], #[ 'poequ', gen + dtype + la + n ], # only diagonal elements (no uplo) - [ 'posv_mixed', gen + dtype_double + la + n + uplo + ddist + grid_order ], - [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + uplo + ddist + grid_order ], + [ 'posv_mixed', gen + dtype_double + la + n + he_matrix ], + [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + he_matrix ], [ 'trtri', gen + dtype + la + n + uplo + diag ], ] @@ -587,9 +592,9 @@ def filter_csv( values, csv ): # svd if (opts.svd): if ('n' in jobu): - cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' ]] + cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' + ge_matrix ]] if ('v' in jobu): - cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' ]] + cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' + ge_matrix ]] cmds += [ # todo: mn (wide), nb, jobu, jobvt @@ -630,31 +635,31 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], - [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'syadd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'headd', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - - [ 'copy', gen + dtype + mn + nonuniform_nb + ddist + grid_order ], - [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ddist + grid_order + uplo ], - [ 'trcopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], - [ 'sycopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], - [ 'hecopy', gen + dtype + n + nonuniform_nb + ddist + grid_order + uplo ], - - [ 'scale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], - [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'syscale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'hescale', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ddist + grid_order ], - - [ 'set', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order ], - [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'trset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'syset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], - [ 'heset', gen + dtype + n + ab + nonuniform_nb + ddist + grid_order + uplo ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb + ge_matrix ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + sy_matrix ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + he_matrix ], + + [ 'copy', gen + dtype + mn + nonuniform_nb + ge_matrix ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ge_matrix + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + ge_matrix + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + sy_matrix ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + he_matrix ], + + [ 'scale', gen + dtype + mn 
+ ab + nonuniform_nb + ge_matrix ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'trscale', gen + dtype + n + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + sy_matrix ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + he_matrix ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ge_matrix ], + + [ 'set', gen + dtype + mn + ab + nonuniform_nb + ge_matrix ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + ge_matrix + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + sy_matrix ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + he_matrix ], ] # ------------------------------------------------------------------------------ diff --git a/test/test_svd.cc b/test/test_svd.cc index 2a1fb276b..7189293a1 100644 --- a/test/test_svd.cc +++ b/test/test_svd.cc @@ -8,10 +8,12 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include @@ -35,9 +37,6 @@ void test_svd_work( Params& params, bool run ) lapack::Job jobvt = params.jobvt(); int64_t m = params.dim.m(); int64_t n = params.dim.n(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); - int64_t nb = params.nb(); int64_t ib = params.ib(); int64_t panel_threads = params.panel_threads(); int64_t lookahead = params.lookahead(); @@ -51,6 +50,10 @@ void test_svd_work( Params& params, bool run ) slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // nonuniform nb is not always supported in the reduction to band + params.nonuniform_nb.used( false ); + params.time(); params.ref_time(); params.error2(); @@ -86,6 +89,11 @@ void test_svd_work( Params& params, bool run ) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + if (check && ! ref && (jobu == slate::Job::NoVec || jobvt == slate::Job::NoVec)) { params.msg() = "job = NoVec requires --ref y to check singular values"; @@ -98,48 +106,6 @@ void test_svd_work( Params& params, bool run ) {slate::Option::InnerBlocking, ib} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - int64_t min_mn = std::min(m, n); - - // Figure out local size. - // matrix A (local input), m-by-n - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - std::vector Acpy_data; - - // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; - // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. - int64_t Um = m; - int64_t Un = jobu == slate::Job::AllVec ? m : min_mn; - int64_t VTm = jobvt == slate::Job::AllVec ? 
n : min_mn; - int64_t VTn = n; - - // matrix U (local output), U(m, min_mn), singular values of A - int64_t mlocU = num_local_rows_cols( Um, nb, myrow, p ); - int64_t nlocU = num_local_rows_cols( Un, nb, mycol, q ); - int64_t lldU = blas::max(1, mlocU); // local leading dimension of U - std::vector U_data(1); - - // matrix VT (local output), VT(min_mn, n) - int64_t mlocVT = num_local_rows_cols( VTm, nb, myrow, p ); - int64_t nlocVT = num_local_rows_cols( VTn, nb, mycol, q ); - int64_t lldVT = blas::max(1, mlocVT); // local leading dimension of VT - std::vector VT_data(1); - - // array Sigma (global output), singular values of A - std::vector Sigma(min_mn); - - slate::Matrix A; // (m, n); - slate::Matrix U; // (m, min_mn); - slate::Matrix VT; // (min_mn, n); - slate::Matrix Acpy; - bool wantu = (jobu == slate::Job::Vec || jobu == slate::Job::AllVec || jobu == slate::Job::SomeVec); @@ -147,45 +113,28 @@ void test_svd_work( Params& params, bool run ) || jobvt == slate::Job::AllVec || jobvt == slate::Job::SomeVec); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + int64_t min_mn = std::min(m, n); - Acpy = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - Acpy.insertLocalTiles(origin_target); + // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; + // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. + int64_t Um = wantu ? m : 0; + int64_t Un = wantu ? (jobu == slate::Job::AllVec ? m : min_mn) : 0; + int64_t VTm = wantvt ? (jobvt == slate::Job::AllVec ? n : min_mn) : 0; + int64_t VTn = wantvt ? n : 0; - if (wantu) { - U = slate::Matrix( Um, Un, nb, p, q, MPI_COMM_WORLD ); - U.insertLocalTiles(origin_target); - } - if (wantvt) { - VT = slate::Matrix( VTm, VTn, nb, p, q, MPI_COMM_WORLD ); - VT.insertLocalTiles(origin_target); - } - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + // array Sigma (global output), singular values of A + std::vector Sigma(min_mn); - Acpy_data.resize( lldA * nlocA ); - Acpy = slate::Matrix::fromScaLAPACK( - m, n, &Acpy_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + auto A_alloc = allocate_test_Matrix( check || ref, true, m, n, params ); + auto U_alloc = allocate_test_Matrix( false, true, Um, Un, params ); + auto VT_alloc = allocate_test_Matrix( false, true, VTm, VTn, params ); + // TODO Acpy isn't always needed + auto Acpy_alloc = allocate_test_Matrix( false, true, m, n, params ); - if (wantu) { - U_data.resize(lldU*nlocU); - U = slate::Matrix::fromScaLAPACK( - Um, Un, &U_data[0], lldU, nb, p, q, MPI_COMM_WORLD ); - } - if (wantvt) { - VT_data.resize(lldVT*nlocVT); - VT = slate::Matrix::fromScaLAPACK( - VTm, VTn, &VT_data[0], lldVT, nb, p, q, MPI_COMM_WORLD ); - } - } + auto& A = A_alloc.A; + auto& U = U_alloc.A; + auto& VT = VT_alloc.A; + auto& Acpy = Acpy_alloc.A; if (verbose >= 1) { printf( "%% A %6lld-by-%6lld\n", llong( A.m() ), llong( A.n() ) ); @@ -201,15 +150,10 @@ void test_svd_work( Params& params, bool run ) slate::generate_matrix( params.matrix, A); print_matrix( "A", A, params ); - slate::Matrix Aref; std::vector Sigma_ref; - std::vector Aref_data; if (check || ref) { Sigma_ref.resize( min_mn ); - Aref_data.resize( lldA * nlocA ); - Aref = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, 
MPI_COMM_WORLD ); - slate::copy( A, Aref ); + slate::copy( A, A_alloc.Aref ); slate::copy( A, Acpy ); } @@ -272,8 +216,8 @@ void test_svd_work( Params& params, bool run ) Rm = blas::max( Rm, m ); if (jobvt == slate::Job::AllVec) Rm = blas::max( Rm, n ); - slate::Matrix R( Rm, Rm, nb, p, q, MPI_COMM_WORLD ); - R.insertLocalTiles(); + auto R_alloc = allocate_test_Matrix( false, true, Rm, Rm, params ); + auto R = R_alloc.A; if (wantu) { //================================================== @@ -344,42 +288,21 @@ void test_svd_work( Params& params, bool run ) #ifdef SLATE_HAVE_SCALAPACK // Run reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - blas_int A_desc[9]; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - blas_int U_desc[9]; - scalapack_descinit(U_desc, m, min_mn, nb, nb, 0, 0, ictxt, mlocU, &info); - slate_assert(info == 0); - - blas_int VT_desc[9]; - scalapack_descinit(VT_desc, min_mn, n, nb, nb, 0, 0, ictxt, mlocVT, &info); - slate_assert(info == 0); - - // Allocate U and VT if not already allocated. - // If origin=scalapack, just overwrite SLATE's U and VT. - if (wantu) { - U_data.resize( lldU * nlocU ); - } - if (wantvt) { - VT_data.resize( lldVT * nlocVT ); + blas_int ictxt, A_desc[9], U_desc[9], VT_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + U_alloc.ScaLAPACK_descriptor( ictxt, U_desc ); + VT_alloc.ScaLAPACK_descriptor( ictxt, VT_desc ); + + auto& Aref_data = A_alloc.Aref_data; + auto& U_data = U_alloc.A_data; + auto& VT_data = VT_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK) { + U_data.resize( U_alloc.lld * U_alloc.nloc ); + VT_data.resize( VT_alloc.lld * VT_alloc.nloc ); } // ScaLAPACK uses job = N and V (same as S); @@ -444,7 +367,7 @@ void test_svd_work( Params& params, bool run ) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } From caffee312ad512b9f8a38d16291c8d43d1254d2a Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 15:57:56 -0500 Subject: [PATCH 18/33] FIXUP 9c933fcf5 --- test/run_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index dcf5d7324..f08073217 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -409,8 +409,8 @@ def filter_csv( values, csv ): [ 'getrs_nopiv', gen + dtype + la + n + ge_matrix + nonuniform_nb + ' --matrix rand_dominant' ], - [ 'getri', gen + dtype + la + n + ge_matrix ], - [ 'getriOOP', gen + dtype + la + n + ge_matrix ], + [ 'getri', gen + dtype + la + n ], + [ 'getriOOP', gen + dtype + la + n ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], [ 'gesv_mixed', gen + dtype_double + la + n + ge_matrix + nonuniform_nb ], From 60421d09ed0d0606bece457fab253cf01922e73a Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 20 Dec 2023 18:02:57 -0500 Subject: [PATCH 
19/33] Add options to norm testers --- test/matrix_utils.cc | 103 +++++++++++--- test/matrix_utils.hh | 15 +- test/run_tests.py | 12 +- test/test_genorm.cc | 272 +++++++++++++++++------------------- test/test_henorm.cc | 302 +++++++++++++++++++-------------------- test/test_scale.cc | 1 - test/test_synorm.cc | 302 +++++++++++++++++++-------------------- test/test_trnorm.cc | 326 +++++++++++++++++++------------------------ test/test_utils.hh | 70 +++++++++- 9 files changed, 726 insertions(+), 677 deletions(-) diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc index df9c1022a..0de0e57e9 100644 --- a/test/matrix_utils.cc +++ b/test/matrix_utils.cc @@ -31,18 +31,21 @@ static TestMatrix allocate_test_shared( TestMatrix matrix( m, n, nb, p, q, grid_order ); // Functions for nonuniform tile sizes or row device distribution - nb_func_t tileMb, tileNb; + nb_func_t tileNb; if (nonuniform_nb) { tileNb = [nb](int64_t j) { // for non-uniform tile size return (j % 2 != 0 ? nb*2 : nb); }; - tileMb = tileNb; } else { - tileNb = slate::func::uniform_blocksize( n, nb ); - tileMb = (m == n) ? tileNb : slate::func::uniform_blocksize( m, nb ); + // NB. we let BaseMatrix truncate the final tile length + // TrapezoidMatrix only takes 1 function for both dimensions (of different sizes) + tileNb = [nb](int64_t j) { + return nb; + }; } + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); int num_devices_ = blas::get_device_count(); auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ), @@ -53,7 +56,7 @@ static TestMatrix allocate_test_shared( // SLATE allocates CPU or GPU tiles. slate::Target origin_target = origin2target( origin ); if (nonuniform_nb || dev_dist == slate::Dist::Col) { - matrix.A = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); + matrix.A = construct_irregular( tileNb, tileRank, tileDevice ); } else { matrix.A = construct_regular( nb, grid_order, p, q ); @@ -72,7 +75,7 @@ static TestMatrix allocate_test_shared( // Setup reference matrix if (ref_matrix) { if (nonuniform_nb && nonuniform_ref) { - matrix.Aref = construct_irregular( tileMb, tileNb, tileRank, tileDevice ); + matrix.Aref = construct_irregular( tileNb, tileRank, tileDevice ); matrix.Aref.insertLocalTiles( slate::Target::Host ); } else { @@ -108,10 +111,10 @@ template TestMatrix> allocate_test_Matrix( bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params) { - auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + auto construct_irregular = [&] (nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) { - return slate::Matrix( m, n, tileMb, tileNb, + return slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); }; auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) @@ -159,7 +162,7 @@ TestMatrix allocate_test_HeSyMatrix( slate::Uplo uplo = params.uplo(); - auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + auto construct_irregular = [&] (nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) { return matrix_type( uplo, n, tileNb, @@ -192,9 +195,6 @@ TestMatrix allocate_test_HeSyMatrix( /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. /// -/// @param[in] m -/// The number of rows -/// /// @param[in] n /// The number of columns /// @@ -238,9 +238,6 @@ TestMatrix>> allocate_test_Hermitian /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. 
/// -/// @param[in] m -/// The number of rows -/// /// @param[in] n /// The number of columns /// @@ -275,7 +272,7 @@ TestMatrix>> allocate_test_Symmetric //------------------------------------------------------------------------------ -/// Allocates a SymmetricMatrix and optionally a reference +/// Allocates a TriangularMatrix and optionally a reference /// version for testing. /// /// @param[in] ref_matrix @@ -285,9 +282,6 @@ TestMatrix>> allocate_test_Symmetric /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. /// -/// @param[in] m -/// The number of rows -/// /// @param[in] n /// The number of columns /// @@ -302,7 +296,7 @@ TestMatrix> allocate_test_TriangularMatrix( slate::Uplo uplo = params.uplo(); slate::Diag diag = params.diag(); - auto construct_irregular = [&] (nb_func_t tileMb, nb_func_t tileNb, + auto construct_irregular = [&] (nb_func_t tileNb, dist_func_t tileRank, dist_func_t tileDevice) { return slate::TriangularMatrix( uplo, diag, n, tileNb, @@ -343,3 +337,72 @@ TestMatrix>> allocate_test_Triangula template TestMatrix>> allocate_test_TriangularMatrix>( bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params); + +//------------------------------------------------------------------------------ +/// Allocates a TrapezoidMatrix and a reference version for testing. +/// +/// @param ref_matrix[in] +/// Whether to allocate a reference matrix +/// +/// @param nonuniform_ref[in] +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_TrapezoidMatrix( + bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params) +{ + // Load params variables + slate::Uplo uplo = params.uplo(); + slate::Diag diag = params.diag(); + + auto construct_irregular = [&] (nb_func_t tileNb, + dist_func_t tileRank, dist_func_t tileDevice) + { + return slate::TrapezoidMatrix( uplo, diag, m, n, tileNb, + tileRank, tileDevice, MPI_COMM_WORLD ); + }; + auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q ) + { + return slate::TrapezoidMatrix( uplo, diag, m, n, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb, + slate::GridOrder grid_order, int p, int q ) + { + return slate::TrapezoidMatrix::fromScaLAPACK( + uplo, diag, m, n, data, lld, nb, + grid_order, p, q, MPI_COMM_WORLD ); + }; + + return allocate_test_shared>( + ref_matrix, nonuniform_ref, m, n, params, + construct_irregular, construct_regular, construct_scalapack ); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. 
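The "let BaseMatrix truncate the final tile length" comment earlier in this patch is worth one concrete illustration: once the callback returns a constant nb, the matrix dimension, not the callback, bounds the last tile. A standalone sketch of that arithmetic (plain C++; it assumes the truncation behaves like the min below):

    // tile_truncation_demo.cc -- why a constant-nb callback suffices.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        int64_t n = 10, nb = 4;
        auto tileNb = [nb]( int64_t ) { return nb; };  // constant, as in the patch

        int64_t nt = (n + nb - 1) / nb;  // number of tile columns: 3
        for (int64_t j = 0, off = 0; j < nt; ++j) {
            int64_t jb = std::min( tileNb( j ), n - off );  // truncation step
            std::cout << "tile " << j << ": width " << jb << "\n";  // 4, 4, 2
            off += jb;
        }
        return 0;
    }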
+template
+TestMatrix<slate::TrapezoidMatrix<float>> allocate_test_TrapezoidMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<double>> allocate_test_TrapezoidMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<float>>> allocate_test_TrapezoidMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<double>>> allocate_test_TrapezoidMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index f92576c84..247a8ef12 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -288,7 +288,7 @@ inline void mark_params_for_test_SymmetricMatrix(Params& params)
 }
 
 //------------------------------------------------------------------------------
-/// Marks the paramters used by allocate_test_HermitianMatrix
+/// Marks the parameters used by allocate_test_TriangularMatrix
 inline void mark_params_for_test_TriangularMatrix(Params& params)
 {
     params.uplo();
@@ -296,6 +296,15 @@ inline void mark_params_for_test_TriangularMatrix(Params& params)
     mark_params_for_test_Matrix( params );
 }
 
+// -----------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TrapezoidMatrix
+inline void mark_params_for_test_TrapezoidMatrix(Params& params)
+{
+    params.uplo();
+    params.diag();
+    mark_params_for_test_Matrix( params );
+}
+
 template <typename scalar_t>
 TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
     bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
@@ -312,4 +321,8 @@ template <typename scalar_t>
 TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
     bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
 
+template <typename scalar_t>
+TestMatrix<slate::TrapezoidMatrix<scalar_t>> allocate_test_TrapezoidMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
 #endif // SLATE_MATRIX_UTILS_HH
diff --git a/test/run_tests.py b/test/run_tests.py
index f08073217..d0ce7aace 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -298,12 +298,12 @@
 origin = ' --origin ' + opts.origin if (opts.origin) else ''
 target = ' --target ' + opts.target if (opts.target) else ''
 la = ' --lookahead ' + opts.lookahead if (opts.lookahead) else ''
-ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else ''
+ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else ''
 nb = ' --nb ' + opts.nb if (opts.nb) else ''
 nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else ''
 nt = ' --nt ' + opts.nt if (opts.nt) else ''
 grid = ' --grid ' + opts.grid if (opts.grid) else ''
-grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else ''
+grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else ''
 repeat = ' --repeat ' + opts.repeat if (opts.repeat) else ''
 thresh = ' --thresh ' + opts.thresh if (opts.thresh) else ''
 matrix = ' --matrix ' + opts.matrix if (opts.matrix) else ''
@@ -607,10 +607,10 @@ def filter_csv( values, csv ):
     # norms
     if (opts.norms):
         cmds += [
-            [ 'genorm', gen + dtype + mn + norm ],
-            [ 'henorm', gen + dtype + n + norm + uplo ],
-            [ 'synorm', gen + dtype + n + norm + uplo ],
-            [ 'trnorm', gen + dtype + mn + norm + uplo + diag ],
+            [ 'genorm', gen + dtype + mn + norm + nonuniform_nb + ge_matrix ],
+            [ 'henorm', gen + dtype + n + norm + nonuniform_nb + he_matrix ],
+            [ 'synorm', gen + dtype + n + norm + nonuniform_nb + sy_matrix ],
+            [ 'trnorm', gen + dtype + mn + norm + nonuniform_nb + tr_matrix ],
 
             # Banded
             [ 'gbnorm', gen + dtype + mn + 
kl + ku + norm ], diff --git a/test/test_genorm.cc b/test/test_genorm.cc index 5437d9429..31f7683df 100644 --- a/test/test_genorm.cc +++ b/test/test_genorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -39,12 +41,17 @@ void test_genorm_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -52,41 +59,26 @@ void test_genorm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); + auto A_alloc = allocate_test_Matrix( ref_copy, false, m, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (check || ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - } + slate::generate_matrix(params.matrix, A); - slate::Matrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. 
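
(For context on the boilerplate deleted here: num_local_rows_cols computes the standard 2D block-cyclic local extent, like ScaLAPACK's numroc. A self-contained sketch of that computation, assuming the usual zero-based cyclic distribution; the helper name and the sizes in main are illustrative, not tester code.)

    #include <cstdint>
    #include <cstdio>

    // Rows owned by process row `myrow` out of `p`, for an m-row matrix
    // distributed block-cyclically with block size nb.
    int64_t local_rows_cols( int64_t m, int64_t nb, int64_t myrow, int64_t p )
    {
        int64_t nblocks = m / nb;            // full blocks in the global dimension
        int64_t rows = (nblocks / p) * nb;   // full cycles owned by every process
        int64_t extra = nblocks % p;         // ranks holding one extra full block
        if (myrow < extra)
            rows += nb;
        else if (myrow == extra)
            rows += m % nb;                  // the ragged final block, if any
        return rows;
    }

    int main()
    {
        // 10 rows, nb = 4, p = 2: blocks of 4, 4, 2 deal out as 4+2 and 4.
        std::printf( "%lld %lld\n",
                     (long long) local_rows_cols( 10, 4, 0, 2 ),
                     (long long) local_rows_cols( 10, 4, 1, 2 ) );
        return 0;
    }
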
- A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix(params.matrix, A); - std::vector values; if (scope == slate::NormScope::Columns) { values.resize(A.n()); @@ -136,33 +128,21 @@ void test_genorm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK) { + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + copy(A, &A_data[0], A_desc); } // allocate work space - std::vector worklange(std::max(mlocA, nlocA)); + std::vector worklange(std::max(A_alloc.mloc, A_alloc.nloc)); // (Sca)LAPACK norms don't support trans; map One <=> Inf norm. slate::Norm op_norm = norm; @@ -199,10 +179,6 @@ void test_genorm_work(Params& params, bool run) } time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lange( - // op_norm, - // m, n, &A_data[0], lldA); - if (scope == slate::NormScope::Matrix) { // difference between norms error = std::abs(A_norm - A_norm_ref) / A_norm_ref; @@ -216,7 +192,7 @@ void test_genorm_work(Params& params, bool run) error /= sqrt(m*n); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -238,107 +214,113 @@ void test_genorm_work(Params& params, bool run) //---------- extended tests if (extended && scope == slate::NormScope::Matrix) { - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops. - int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - if (i < 0 || i >= mt) + else { + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. 
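
(The extended tests being re-indented here all follow a save/poke/restore pattern. A scalar analogue in plain C++, with made-up sizes, showing the idea without SLATE: inject a known peak, verify the recomputed norm picks it up, then restore the entry so later checks see clean data.)

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    double max_norm( std::vector<double> const& a )
    {
        double v = 0;
        for (double x : a)
            v = std::max( v, std::fabs( x ) );
        return v;
    }

    int main()
    {
        std::vector<double> a( 100, 1.0 );
        double peak = 1e6;
        double save = a[ 42 ];       // save the entry
        a[ 42 ] = peak;              // poke the peak
        bool okay = (max_norm( a ) == peak);
        a[ 42 ] = save;              // restore for any later checks
        std::printf( "%s\n", okay ? "pass" : "failed" );
        return 0;
    }
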
+ int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib) { + for (auto i : i_indices) { + if (i < 0 || i >= mt) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the genorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - A_norm_ref = scalapack_plange( - norm2str(norm), m, n, - &A_data[0], 1, 1, A_desc, - &worklange[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + A_norm_ref = scalapack_plange( + norm2str(norm), m, n, + &A_data[0], 1, 1, A_desc, + &worklange[0]); - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! 
okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the genorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -351,7 +333,7 @@ void test_genorm_work(Params& params, bool run) SLATE_UNUSED( A_norm ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_henorm.cc b/test/test_henorm.cc index 08b5fd4bc..d18a38792 100644 --- a/test/test_henorm.cc +++ b/test/test_henorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -36,12 +38,17 @@ void test_henorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_HermitianMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -51,44 +58,26 @@ void test_henorm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. 
- int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto A_alloc = allocate_test_HermitianMatrix( ref_copy, false, n, params ); - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::HermitianMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::HermitianMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::HermitianMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix( params.matrix, A ); - print_matrix("A", A, params); if (trace) slate::trace::Trace::on(); @@ -110,38 +99,28 @@ void test_henorm_work(Params& params, bool run) params.time() = time; #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + // comparison with reference routine from ScaLAPACK // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? 
A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); } if (check || ref) { // comparison with reference routine from ScaLAPACK // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklanhe( lwork ); //================================================== @@ -153,10 +132,6 @@ void test_henorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lanhe( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -166,7 +141,7 @@ void test_henorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -188,119 +163,124 @@ void test_henorm_work(Params& params, bool run) //---------- extended tests if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklanhe(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. - int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // allocate work space + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), + scalapack_ilcm( p, q ) / p ); + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; + std::vector worklanhe(lwork); + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. 
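
(To make the lanhe workspace bound above concrete, a worked instance with made-up grid values; ceildiv and the ilcm factor mirror the tester's formula, nothing else is real.)

    #include <cstdint>
    #include <cstdio>
    #include <numeric>   // std::lcm

    int64_t ceildiv( int64_t a, int64_t b ) { return (a + b - 1) / b; }

    int main()
    {
        int64_t p = 2, q = 3, nb = 100;
        int64_t mloc = 500, nloc = 400;      // assumed local matrix extents
        int64_t ilcm = std::lcm( p, q );     // ScaLAPACK's ilcm(p, q) = 6
        int64_t ldw = nb * ceildiv( ceildiv( nloc, nb ), ilcm / p );
        int64_t lwork = 2*mloc + nloc + ldw; // = 1000 + 400 + 200 = 1600
        std::printf( "ldw %lld, lwork %lld\n",
                     (long long) ldw, (long long) lwork );
        return 0;
    }
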
- std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - A_norm = slate::norm(norm, A, opts); + A_norm = slate::norm(norm, A, opts); - real_t A_norm_ref = scalapack_planhe( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); + real_t A_norm_ref = scalapack_planhe( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + // difference between norms + real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. 
- bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // Allow for difference, except max norm in real should be exact. + real_t eps = std::numeric_limits::epsilon(); + real_t tol; + if (norm == slate::Norm::Max && ! slate::is_complex::value) + tol = 0; + else + tol = 10*eps; + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -315,7 +295,7 @@ void test_henorm_work(Params& params, bool run) SLATE_UNUSED( ref ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) + if ((check || ref) && A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale.cc b/test/test_scale.cc index d6a71c5af..0d05b056d 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_synorm.cc b/test/test_synorm.cc index 0976e59c5..1145006fe 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -36,12 +38,17 @@ void test_synorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_SymmetricMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -49,44 +56,26 @@ void test_synorm_work(Params& params, bool run) if (! 
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); + auto A_alloc = allocate_test_SymmetricMatrix( ref_copy, false, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::SymmetricMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::SymmetricMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::SymmetricMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix(params.matrix, A); - print_matrix("A", A, params); if (trace) slate::trace::Trace::on(); @@ -108,38 +97,28 @@ void test_synorm_work(Params& params, bool run) params.time() = time; #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + // comparison with reference routine from ScaLAPACK // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? 
A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); } if (check || ref) { // comparison with reference routine from ScaLAPACK // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklansy(lwork); //================================================== @@ -151,10 +130,6 @@ void test_synorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklansy[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lansy( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -164,7 +139,7 @@ void test_synorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -186,119 +161,124 @@ void test_synorm_work(Params& params, bool run) //---------- extended tests if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklansy(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. - int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // allocate work space + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), + scalapack_ilcm( p, q ) / p ); + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; + std::vector worklansy(lwork); + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. 
- std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plansy( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklansy[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + real_t A_norm_ref = scalapack_plansy( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklansy[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! 
okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real( peak ), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + // Allow for difference, except max norm in real should be exact. + real_t eps = std::numeric_limits::epsilon(); + real_t tol; + if (norm == slate::Norm::Max && ! slate::is_complex::value) + tol = 0; + else + tol = 10*eps; + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real( peak ), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -313,7 +293,7 @@ void test_synorm_work(Params& params, bool run) SLATE_UNUSED( ref ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) + if ((check || ref) && A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 8262e6ac9..4cd1a4cbc 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -7,10 +7,12 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" + #include "grid_utils.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -38,12 +40,17 @@ void test_trnorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_TrapezoidMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -51,55 +58,26 @@ void test_trnorm_work(Params& params, bool run) if (! 
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // upper requires m <= n, - // lower requires m >= n - if ((uplo == slate::Uplo::Lower && m < n) || - (uplo == slate::Uplo::Upper && m > n)) { - char buf[255]; - snprintf( buf, sizeof( buf ), "skipping invalid size: %s, %lld-by-%lld", - uplo2str( uplo ), llong( m ), llong( n ) ); - params.msg() = buf; - return; - } - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto A_alloc = allocate_test_TrapezoidMatrix( ref_copy, false, m, n, params ); - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::TrapezoidMatrix A0(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::TrapezoidMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::TrapezoidMatrix(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::TrapezoidMatrix::fromScaLAPACK( - uplo, diag, m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix( params.matrix, A ); - print_matrix("A", A, params); if (trace) @@ -124,38 +102,27 @@ void test_trnorm_work(Params& params, bool run) params.time() = time; #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; + // comparison with reference routine from ScaLAPACK // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - if (info != 0) - printf( "scalapack_descinit info %lld\n", llong( info ) ); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? 
A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); } + // TODO move the above into this if statement if (check || ref) { // comparison with reference routine from ScaLAPACK // allocate work space - std::vector worklantr(std::max(mlocA, nlocA)); + std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); //================================================== // Run ScaLAPACK reference routine. @@ -166,10 +133,6 @@ void test_trnorm_work(Params& params, bool run) m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lantr( - // norm, A.uplo(), diag, - // m, n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One) { @@ -182,7 +145,7 @@ void test_trnorm_work(Params& params, bool run) error /= sqrt(m*n); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -204,125 +167,130 @@ void test_trnorm_work(Params& params, bool run) //---------- extended tests if (extended) { - // allocate work space - std::vector worklantr(std::max(mlocA, nlocA)); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops. - int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // allocate work space + std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. 
- std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : i_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - slate_assert(A_data[ ilocal + jlocal*lldA ] == save); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plantr( - norm2str(norm), uplo2str(A.uplo()), diag2str(diag), - m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + real_t A_norm_ref = scalapack_plantr( + norm2str(norm), uplo2str(A.uplo()), diag2str(diag), + m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! 
slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan, - // except in Unit case with i == j and ii == jj, - // where peak shouldn't affect A_norm. - bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); + } + + // Allow for difference, except max norm in real should be exact. + real_t eps = std::numeric_limits::epsilon(); + real_t tol; + if (norm == slate::Norm::Max && ! slate::is_complex::value) + tol = 0; + else + tol = 10*eps; + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan, + // except in Unit case with i == j and ii == jj, + // where peak shouldn't affect A_norm. + bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -338,7 +306,7 @@ void test_trnorm_work(Params& params, bool run) SLATE_UNUSED( ref ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_utils.hh b/test/test_utils.hh index c16d334a7..f04ecb4b8 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -13,7 +13,7 @@ /// /// @return true if the configuration should be skipped /// -inline bool is_invalid_parameters(Params& params) +inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = false) { slate::Origin origin = params.origin(); slate::Target target = params.target(); @@ -36,7 +36,7 @@ inline bool is_invalid_parameters(Params& params) } #ifdef SLATE_HAVE_SCALAPACK - if (nonuniform_nb && params.ref()) { + if (!keep_nonuniform_ref && nonuniform_nb && params.ref()) { params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; params.ref() = false; } @@ -128,7 +128,71 @@ void matrix_iterator( } } else { - assert( false ); + bool is_upper = (A.uplo() == slate::Uplo::Upper); + + int64_t A_j = 0, A_jj = 0; + for (int64_t B_j = 0; B_j < B_nt; ++B_j) { + + int64_t A_i = 0, A_ii = 0; + for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + + bool right_uplo = is_upper ? (B_i <= B_j) : (B_i >= B_j); + if (right_uplo) { + + #pragma omp task shared(A, B) firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) + { + int tag = A_i + A_j * A.mt(); + if (B.tileIsLocal( B_i, B_j )) { + A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), + slate::Layout::ColMajor, tag ); + + A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); + B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); + auto TA = A( A_i, A_j ); + auto TB = B( B_i, B_j ); + int64_t mb = TB.mb(); + int64_t nb = TB.nb(); + assert( A_ii + mb <= TA.mb() ); + assert( A_jj + nb <= TA.nb() ); + int64_t lda = TA.stride(); + int64_t ldb = TB.stride(); + scalar_t const* TA_data = TA.data(); + scalar_t* TB_data = TB.data(); + + for (int64_t jj = 0; jj < nb; ++jj) { + int64_t ii_start = 0, ii_end = mb; + if (B_i == B_j) { // diagonal tile + if (is_upper) + ii_end = std::min(jj+1, mb); + else + ii_start = jj; + } + for (int64_t ii = ii_start; ii < ii_end; ++ii) { + thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], + TB_data[ ii + jj*ldb ] ); + } + } + } + else if (A.tileIsLocal( A_i, A_j )) { + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); + } + } + } + + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; + } + } + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; + } + } } A.releaseRemoteWorkspace(); From 6b7636b2a1676d5880a4c25c4dcd49967659ebec Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Thu, 21 Dec 2023 11:46:11 -0500 Subject: [PATCH 20/33] Fix bug in henorm and synorm for non-uniform tile sizes --- src/internal/internal_henorm.cc | 25 +++++++++++++------------ src/internal/internal_synorm.cc | 25 +++++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc index 7b4eecd14..36a0fe925 100644 --- a/src/internal/internal_henorm.cc +++ b/src/internal/internal_henorm.cc @@ -196,10 +196,11 @@ void norm( // Sum tile results into local results. // Summing up local contributions only. 
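
(The bug fixed in this hunk is easiest to see in isolation: with nonuniform tiles, tile j's first column is the sum of the widths before it, not j*nb0. A toy, SLATE-free illustration of the difference:)

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int64_t> widths = { 2, 4, 2, 4 };  // nonuniform tile widths
        int64_t nb0 = widths[ 0 ];

        int64_t jj = 0;   // running first-column offset, as in the fix
        for (size_t j = 0; j < widths.size(); ++j) {
            std::printf( "tile %zu: wrong offset %lld, correct offset %lld\n",
                         j, (long long) (j * nb0), (long long) jj );
            jj += widths[ j ];
        }
        return 0;
    }

The j*nb0 indexing gives 0, 2, 4, 6 where the correct offsets are 0, 2, 6, 8; the two agree only when every tile has width nb0.
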
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
                 int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
@@ -210,27 +211,27 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += A.tileMb(i);
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
+            // diagonal blocks
             int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
     //---------
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 9f814b898..9a0b900cc 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -195,10 +195,11 @@ void norm(
         // Sum tile results into local results.
         // Summing up local contributions only.
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
                 int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
@@ -209,27 +210,27 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += A.tileMb(i);
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
+            // diagonal blocks
             int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
     //---------

From 5be01cfeac1852a5efb149ecb26b794149c610af Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Thu, 21 Dec 2023 11:46:42 -0500
Subject: [PATCH 21/33] Reduce code duplication in matrix_iterator

---
 test/test_utils.hh | 111 ++++++++++++---------------------------------
 1 file changed, 28 insertions(+), 83 deletions(-)

diff --git a/test/test_utils.hh b/test/test_utils.hh
index f04ecb4b8..b4fbba6db 100644
--- a/test/test_utils.hh
+++ b/test/test_utils.hh
@@ -73,13 +73,18 @@ void matrix_iterator(
     int64_t B_mt = B.mt();
     int64_t B_nt = B.nt();
 
-    if constexpr (std::is_same>::value) {
-        int64_t A_j = 0, A_jj = 0;
-        for (int64_t B_j = 0; B_j < B_nt; ++B_j) {
-
-            int64_t A_i = 0, A_ii = 0;
-            for (int64_t B_i = 0; B_i < B_mt; ++B_i) {
+    constexpr bool is_general = std::is_same_v>;
+    assert( is_general == (A.uplo() == slate::Uplo::General) );
+    bool is_upper = (A.uplo() == slate::Uplo::Upper);
+
+    int64_t A_j = 0, A_jj = 0;
+    for (int64_t B_j = 0; B_j < B_nt; ++B_j) {
+
+        int64_t A_i = 0, A_ii = 0;
+        for (int64_t B_i = 0; B_i < B_mt; ++B_i) {
+
+            if (is_general || (is_upper ? 
(B_i <= B_j) : (B_i >= B_j))) { #pragma omp task shared(A, B) \ firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) { @@ -101,7 +106,14 @@ void matrix_iterator( scalar_t const* TA_data = TA.data(); scalar_t* TB_data = TB.data(); for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { + int64_t ii_start = 0, ii_end = mb; + if (!is_general && B_i == B_j) { // diagonal tile + if (is_upper) + ii_end = std::min(jj+1, mb); + else + ii_start = jj; + } + for (int64_t ii = ii_start; ii < ii_end; ++ii) { thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], TB_data[ ii + jj*ldb ] ); } @@ -112,86 +124,19 @@ void matrix_iterator( } } - A_ii += B.tileMb( B_i ); - assert( A_ii <= A.tileMb( A_i ) ); - if (A_ii == A.tileMb( A_i )) { - ++A_i; - A_ii = 0; - } } - A_jj += B.tileNb( B_j ); - assert( A_jj <= A.tileNb( A_j ) ); - if (A_jj == A.tileNb( A_j )) { - ++A_j; - A_jj = 0; + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; } } - } - else { - bool is_upper = (A.uplo() == slate::Uplo::Upper); - - int64_t A_j = 0, A_jj = 0; - for (int64_t B_j = 0; B_j < B_nt; ++B_j) { - - int64_t A_i = 0, A_ii = 0; - for (int64_t B_i = 0; B_i < B_mt; ++B_i) { - - bool right_uplo = is_upper ? (B_i <= B_j) : (B_i >= B_j); - if (right_uplo) { - - #pragma omp task shared(A, B) firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) - { - int tag = A_i + A_j * A.mt(); - if (B.tileIsLocal( B_i, B_j )) { - A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), - slate::Layout::ColMajor, tag ); - - A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); - B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); - auto TA = A( A_i, A_j ); - auto TB = B( B_i, B_j ); - int64_t mb = TB.mb(); - int64_t nb = TB.nb(); - assert( A_ii + mb <= TA.mb() ); - assert( A_jj + nb <= TA.nb() ); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - - for (int64_t jj = 0; jj < nb; ++jj) { - int64_t ii_start = 0, ii_end = mb; - if (B_i == B_j) { // diagonal tile - if (is_upper) - ii_end = std::min(jj+1, mb); - else - ii_start = jj; - } - for (int64_t ii = ii_start; ii < ii_end; ++ii) { - thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], - TB_data[ ii + jj*ldb ] ); - } - } - } - else if (A.tileIsLocal( A_i, A_j )) { - A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); - } - } - } - - A_ii += B.tileMb( B_i ); - assert( A_ii <= A.tileMb( A_i ) ); - if (A_ii == A.tileMb( A_i )) { - ++A_i; - A_ii = 0; - } - } - A_jj += B.tileNb( B_j ); - assert( A_jj <= A.tileNb( A_j ) ); - if (A_jj == A.tileNb( A_j )) { - ++A_j; - A_jj = 0; - } + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; } } From 31d84c3db165a049a59df693c267ad5cf634c2fc Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 26 Dec 2023 16:18:56 -0500 Subject: [PATCH 22/33] Improve some warning messages --- test/test_utils.hh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/test_utils.hh b/test/test_utils.hh index b4fbba6db..ab1611665 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -26,19 +26,25 @@ inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = fal } if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: dev_dist = Col tile not supported with ScaLAPACK"; + params.msg() = "skipping: dev_dist = Col tile not supported with origin=ScaLAPACK"; return true; } if 
(nonuniform_nb && origin == slate::Origin::ScaLAPACK) {
-        params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK";
+        params.msg() = "skipping: nonuniform tile not supported with origin=ScaLAPACK";
         return true;
     }
 
 #ifdef SLATE_HAVE_SCALAPACK
-    if (!keep_nonuniform_ref && nonuniform_nb && params.ref()) {
+    if (!keep_nonuniform_ref && nonuniform_nb && params.ref() != 'n') {
         params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK";
-        params.ref() = false;
+        if (params.ref() == 'o') {
+            // If ref == 'o', the user doesn't want to run the SLATE version
+            return true;
+        }
+        else {
+            params.ref() = 'n';
+        }
     }
 #else
     // Can only run ref when we have ScaLAPACK

From 8e00a465f840025cd9f9bdd00b75333b2d57a14b Mon Sep 17 00:00:00 2001
From: Neil Lindquist
Date: Wed, 27 Dec 2023 14:10:10 -0500
Subject: [PATCH 23/33] Delete unused header

---
 test/scalapack_support_routines.hh | 191 -----------------------------
 test/test_add.cc                   |   1 -
 test/test_bdsqr.cc                 |   1 -
 test/test_copy.cc                  |   1 -
 test/test_gbnorm.cc                |   1 -
 test/test_gecondest.cc             |   1 -
 test/test_gelqf.cc                 |   1 -
 test/test_gels.cc                  |   1 -
 test/test_gemm.cc                  |   1 -
 test/test_geqrf.cc                 |   1 -
 test/test_gesv.cc                  |   1 -
 test/test_getri.cc                 |   1 -
 test/test_hb2st.cc                 |   1 -
 test/test_hbnorm.cc                |   1 -
 test/test_heev.cc                  |   1 -
 test/test_hegst.cc                 |   1 -
 test/test_hegv.cc                  |   1 -
 test/test_hesv.cc                  |   1 -
 test/test_pocondest.cc             |   1 -
 test/test_potri.cc                 |   1 -
 test/test_scale_row_col.cc         |   1 -
 test/test_set.cc                   |   1 -
 test/test_stedc.cc                 |   1 -
 test/test_stedc_deflate.cc         |   1 -
 test/test_stedc_secular.cc         |   1 -
 test/test_stedc_sort.cc            |   1 -
 test/test_stedc_z_vector.cc        |   1 -
 test/test_steqr2.cc                |   1 -
 test/test_sterf.cc                 |   2 +-
 test/test_tb2bd.cc                 |   1 -
 test/test_tbsm.cc                  |   1 -
 test/test_trcondest.cc             |   1 -
 test/test_trtri.cc                 |   1 -
 test/test_unmqr.cc                 |   1 -
 34 files changed, 1 insertion(+), 224 deletions(-)
 delete mode 100644 test/scalapack_support_routines.hh

diff --git a/test/scalapack_support_routines.hh b/test/scalapack_support_routines.hh
deleted file mode 100644
index 458dc7e42..000000000
--- a/test/scalapack_support_routines.hh
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright (c) 2009-2023, University of Tennessee. All rights reserved.
-// Copyright (c) 2010, University of Denver, Colorado.
-// SPDX-License-Identifier: BSD-3-Clause
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
- -#ifndef SLATE_SCALAPACK_SUPPORT_HH -#define SLATE_SCALAPACK_SUPPORT_HH - -#include - -#include "scalapack_wrappers.hh" - -//------------------------------------------------------------------------------ -// Matrix generation -#define Rnd64_A 6364136223846793005ULL -#define Rnd64_C 1ULL -#define RndF_Mul 5.4210108624275222e-20f -#define RndD_Mul 5.4210108624275222e-20 - -typedef unsigned long long ull; - -static inline ull Rnd64_jump(ull n, ull seed) -{ - ull a_k, c_k, ran; - int64_t i; - - a_k = Rnd64_A; - c_k = Rnd64_C; - ran = seed; - for (i = 0; n; n >>= 1, ++i) { - if (n & 1) - ran = a_k*ran + c_k; - c_k *= (a_k + 1); - a_k *= a_k; - } - return ran; -} - -template -static inline void CORE_plrnt(int64_t m, int64_t n, scalar_t* A, int64_t lda, - int64_t bigM, int64_t m0, int64_t n0, ull seed) -{ - scalar_t* tmp = A; - int64_t i, j; - ull ran, jump; - - jump = (ull)m0 + (ull)n0*(ull)bigM; - for (j = 0; j < n; ++j) { - ran = Rnd64_jump(jump, (ull)seed); - for (i = 0; i < m; ++i) { - *tmp = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - ++tmp; - } - tmp += lda - i; - jump += bigM; - } -} - -template -static inline void CORE_plghe(scalar_t bump, int64_t m, int64_t n, scalar_t* A, int64_t lda, - int64_t gM, int64_t m0, int64_t n0, ull seed) -{ - scalar_t* tmp = A; - int64_t i, j; - ull ran, jump; - - jump = (ull)m0 + (ull)n0*(ull)gM; - /* Tile diagonal */ - if (m0 == n0) { - for (j = 0; j < n; ++j) { - ran = Rnd64_jump(jump, seed); - - for (i = j; i < m; ++i) { - *tmp = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - ++tmp; - } - tmp += (lda - i + j + 1); - jump += gM + 1; - } - for (j = 0; j < n; ++j) { - A[j + j*lda] += bump; - - for (i = 0; i < j; ++i) - A[lda*j + i] = A[lda*i + j]; - } - } - - /* Lower part */ - else if (m0 > n0) { - for (j = 0; j < n; ++j) { - ran = Rnd64_jump(jump, seed); - - for (i = 0; i < m; ++i) { - *tmp = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - ++tmp; - } - tmp += (lda - i); - jump += gM; - } - } - - /* Upper part */ - else if (m0 < n0) { - /* Overwrite jump */ - jump = (ull)n0 + (ull)m0*(ull)gM; - - for (i = 0; i < m; ++i) { - ran = Rnd64_jump(jump, seed); - - for (j = 0; j < n; ++j) { - A[j*lda + i] = 0.5f - ran*RndF_Mul; - ran = Rnd64_A*ran + Rnd64_C; - } - jump += gM; - } - } -} - -template -static void scalapack_pplrnt(scalar_t* A, - int64_t m, int64_t n, - int64_t mb, int64_t nb, - blas_int myrow, blas_int mycol, - blas_int nprow, blas_int npcol, - int64_t lldA, - int64_t seed) -{ - blas_int idum1, idum2, iloc, jloc, i0 = 0; - blas_int tempm, tempn; - scalar_t* Ab; - blas_int mb_ = blas_int( mb ); - blas_int nb_ = blas_int( nb ); - - // #pragma omp parallel for - for (blas_int i = 1; i <= m; i += mb) { - for (blas_int j = 1; j <= n; j += nb) { - if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) && - (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) { - iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow); - jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol); - - Ab = &A[(jloc - 1)*lldA + (iloc - 1) ]; - tempm = (m - i + 1) > mb ? mb : (m - i + 1); - tempn = (n - j + 1) > nb ? 
nb : (n - j + 1); - CORE_plrnt(tempm, tempn, Ab, lldA, - m, (i - 1), (j - 1), seed); - } - } - } -} - - -template -static void scalapack_pplghe(scalar_t* A, - int64_t m, int64_t n, - int64_t mb, int64_t nb, - blas_int myrow, blas_int mycol, - blas_int nprow, blas_int npcol, - int64_t lldA, - int64_t seed) -{ - blas_int idum1, idum2, iloc, jloc, i0 = 0; - int64_t tempm, tempn; - scalar_t* Ab; - scalar_t bump = (scalar_t)m; - blas_int mb_ = blas_int( mb ); - blas_int nb_ = blas_int( nb ); - - // #pragma omp parallel for - for (blas_int i = 1; i <= m; i += mb) { - for (blas_int j = 1; j <= n; j += nb) { - if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) && - (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) { - iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow); - jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol); - - Ab = &A[(jloc - 1)*lldA + (iloc - 1) ]; - tempm = (m - i + 1) > mb ? mb : (m - i + 1); - tempn = (n - j + 1) > nb ? nb : (n - j + 1); - CORE_plghe(bump, tempm, tempn, Ab, lldA, - m, (i - 1), (j - 1), seed); - } - } - } -} - -#endif // SLATE_SCALAPACK_SUPPORT_HH diff --git a/test/test_add.cc b/test/test_add.cc index b6fa65700..0d56f8268 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_bdsqr.cc b/test/test_bdsqr.cc index 8e4ac041f..b3ffa96f0 100644 --- a/test/test_bdsqr.cc +++ b/test/test_bdsqr.cc @@ -7,7 +7,6 @@ #include "blas.hh" #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "internal/internal.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_copy.cc b/test/test_copy.cc index f8aa9bd1c..1e8dfb99a 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_gbnorm.cc b/test/test_gbnorm.cc index 883d07aff..7ac9a510d 100644 --- a/test/test_gbnorm.cc +++ b/test/test_gbnorm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "print_matrix.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_gecondest.cc b/test/test_gecondest.cc index 9bb9badad..8f365bee8 100644 --- a/test/test_gecondest.cc +++ b/test/test_gecondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gelqf.cc b/test/test_gelqf.cc index 915d67455..68a84747c 100644 --- a/test/test_gelqf.cc +++ b/test/test_gelqf.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gels.cc b/test/test_gels.cc index 9ad6a3bbc..2d20e213a 100644 --- a/test/test_gels.cc +++ b/test/test_gels.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gemm.cc b/test/test_gemm.cc index c20db8ea0..8add9e9c5 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -13,7 +13,6 @@ #include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include 
"scalapack_copy.hh" #include diff --git a/test/test_geqrf.cc b/test/test_geqrf.cc index 6ae7d8302..18c8bf114 100644 --- a/test/test_geqrf.cc +++ b/test/test_geqrf.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 422dc686a..17c7672f8 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -14,7 +14,6 @@ #include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_getri.cc b/test/test_getri.cc index 6a0343e52..3693dcba1 100644 --- a/test/test_getri.cc +++ b/test/test_getri.cc @@ -10,7 +10,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hb2st.cc b/test/test_hb2st.cc index 5c3f4bce7..607250fe1 100644 --- a/test/test_hb2st.cc +++ b/test/test_hb2st.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_hbnorm.cc b/test/test_hbnorm.cc index c62de133a..bf5c5cd2c 100644 --- a/test/test_hbnorm.cc +++ b/test/test_hbnorm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "band_utils.hh" diff --git a/test/test_heev.cc b/test/test_heev.cc index 505c1c0f3..fc7077f2a 100644 --- a/test/test_heev.cc +++ b/test/test_heev.cc @@ -9,7 +9,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hegst.cc b/test/test_hegst.cc index 93d78d275..35574087d 100644 --- a/test/test_hegst.cc +++ b/test/test_hegst.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "auxiliary/Debug.hh" #include "grid_utils.hh" diff --git a/test/test_hegv.cc b/test/test_hegv.cc index d342be443..f6061b0a1 100644 --- a/test/test_hegv.cc +++ b/test/test_hegv.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hesv.cc b/test/test_hesv.cc index 2e8f54af1..8dbe4b83c 100644 --- a/test/test_hesv.cc +++ b/test/test_hesv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" #include "print_matrix.hh" diff --git a/test/test_pocondest.cc b/test/test_pocondest.cc index 47d45109e..081e03e29 100644 --- a/test/test_pocondest.cc +++ b/test/test_pocondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_potri.cc b/test/test_potri.cc index 647181acd..07d926ffc 100644 --- a/test/test_potri.cc +++ b/test/test_potri.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index e239beb0d..e538476ee 100644 --- a/test/test_scale_row_col.cc +++ 
b/test/test_scale_row_col.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_set.cc b/test/test_set.cc index 5e1fdf024..9b91e7583 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" diff --git a/test/test_stedc.cc b/test/test_stedc.cc index b28271c10..73ff18ec9 100644 --- a/test/test_stedc.cc +++ b/test/test_stedc.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" #include "matrix_generator.hh" diff --git a/test/test_stedc_deflate.cc b/test/test_stedc_deflate.cc index 9ad2dedd8..077710af2 100644 --- a/test/test_stedc_deflate.cc +++ b/test/test_stedc_deflate.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_secular.cc b/test/test_stedc_secular.cc index 2e5aeabaf..e321bb8cd 100644 --- a/test/test_stedc_secular.cc +++ b/test/test_stedc_secular.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_sort.cc b/test/test_stedc_sort.cc index 96aa1d451..26eff3fd7 100644 --- a/test/test_stedc_sort.cc +++ b/test/test_stedc_sort.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_z_vector.cc b/test/test_stedc_z_vector.cc index bc8a5013f..64ca668a2 100644 --- a/test/test_stedc_z_vector.cc +++ b/test/test_stedc_z_vector.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_steqr2.cc b/test/test_steqr2.cc index 4d092669e..ff5ec29b9 100644 --- a/test/test_steqr2.cc +++ b/test/test_steqr2.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_sterf.cc b/test/test_sterf.cc index 1d0a7f2a5..5488ef7fe 100644 --- a/test/test_sterf.cc +++ b/test/test_sterf.cc @@ -7,7 +7,7 @@ #include "blas.hh" #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" + #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_tb2bd.cc b/test/test_tb2bd.cc index a1ab1aafa..29204bda2 100644 --- a/test/test_tb2bd.cc +++ b/test/test_tb2bd.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_tbsm.cc b/test/test_tbsm.cc index 2442245b4..b7204bb90 100644 --- a/test/test_tbsm.cc +++ b/test/test_tbsm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "print_matrix.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_trcondest.cc b/test/test_trcondest.cc index 74fbac410..1301854fb 100644 --- a/test/test_trcondest.cc +++ b/test/test_trcondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include 
"scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_trtri.cc b/test/test_trtri.cc index 2cd7f6daf..e0c0d9435 100644 --- a/test/test_trtri.cc +++ b/test/test_trtri.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "grid_utils.hh" diff --git a/test/test_unmqr.cc b/test/test_unmqr.cc index d7270b303..87eff3ecf 100644 --- a/test/test_unmqr.cc +++ b/test/test_unmqr.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include From faddc11a772bed6879bdfdfcd9b3af4cdde18437 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 14:22:35 -0500 Subject: [PATCH 24/33] Cleanup some header include's --- test/test_add.cc | 1 - test/test_copy.cc | 1 - test/test_genorm.cc | 1 - test/test_gesv.cc | 1 - test/test_hemm.cc | 1 - test/test_henorm.cc | 1 - test/test_her2k.cc | 1 - test/test_herk.cc | 1 - test/test_posv.cc | 1 - test/test_scale.cc | 1 - test/test_scale_row_col.cc | 1 - test/test_set.cc | 1 - test/test_svd.cc | 1 - test/test_symm.cc | 1 - test/test_synorm.cc | 1 - test/test_syr2k.cc | 1 - test/test_syrk.cc | 1 - test/test_trmm.cc | 1 - test/test_trnorm.cc | 1 - test/test_trsm.cc | 1 - test/test_utils.hh | 1 + 21 files changed, 1 insertion(+), 20 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index 0d56f8268..0596c73f8 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_copy.cc b/test/test_copy.cc index 1e8dfb99a..d7b3bf0fc 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_genorm.cc b/test/test_genorm.cc index 31f7683df..41c03cee5 100644 --- a/test/test_genorm.cc +++ b/test/test_genorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 17c7672f8..e8d7b96e5 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_hemm.cc b/test/test_hemm.cc index a31612b53..d874c40ac 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_henorm.cc b/test/test_henorm.cc index d18a38792..e03a6ac30 100644 --- a/test/test_henorm.cc +++ b/test/test_henorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_her2k.cc b/test/test_her2k.cc index fc8d4077b..7fe3ed067 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -8,7 +8,6 @@ #include "print_matrix.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_herk.cc b/test/test_herk.cc index d3f9144ba..2bbef4b7a 100644 --- 
a/test/test_herk.cc +++ b/test/test_herk.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_posv.cc b/test/test_posv.cc index 161413a70..8b2d6ef69 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_scale.cc b/test/test_scale.cc index 0d05b056d..c04652e39 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index e538476ee..18e0819fd 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_set.cc b/test/test_set.cc index 9b91e7583..85eed4200 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_svd.cc b/test/test_svd.cc index 7189293a1..36c4e9ba3 100644 --- a/test/test_svd.cc +++ b/test/test_svd.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_symm.cc b/test/test_symm.cc index e463b5d34..8d6ebed19 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_synorm.cc b/test/test_synorm.cc index 1145006fe..fa8d3b250 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index a42aaf6c9..cf051cabd 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -8,7 +8,6 @@ #include "print_matrix.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_syrk.cc b/test/test_syrk.cc index bc8e6e1d0..bae13cf10 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_trmm.cc b/test/test_trmm.cc index 941ee2ecb..a8b6d08e6 100644 --- a/test/test_trmm.cc +++ b/test/test_trmm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "blas/flops.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 4cd1a4cbc..2f650fdeb 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -10,7 +10,6 @@ #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" #include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_trsm.cc b/test/test_trsm.cc index 5f4bd7045..d1b8affe4 100644 --- a/test/test_trsm.cc +++ b/test/test_trsm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" 
#include "matrix_utils.hh" #include "test_utils.hh" diff --git a/test/test_utils.hh b/test/test_utils.hh index ab1611665..6c25788d3 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -7,6 +7,7 @@ #define SLATE_TEST_UTILS_HH #include "slate/slate.hh" +#include "test.hh" ///----------------------------------------------------------------------------- /// Checks for common invalid parameter combinations From 3d3d1528e531729d3d3c1ede91ef078532ff47a3 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 14:51:43 -0500 Subject: [PATCH 25/33] Make norm testers more consistent --- test/test_henorm.cc | 256 +++++++++++++++++++--------------------- test/test_synorm.cc | 256 +++++++++++++++++++--------------------- test/test_trnorm.cc | 276 +++++++++++++++++++++----------------------- 3 files changed, 373 insertions(+), 415 deletions(-) diff --git a/test/test_henorm.cc b/test/test_henorm.cc index e03a6ac30..a9a24a9fe 100644 --- a/test/test_henorm.cc +++ b/test/test_henorm.cc @@ -97,24 +97,22 @@ void test_henorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // comparison with reference routine from ScaLAPACK - - // initialize BLACS and ScaLAPACK - blas_int ictxt, A_desc[9]; - A_alloc.create_ScaLAPACK_context( &ictxt ); - A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK + // comparison with reference routine from ScaLAPACK - auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); - if (origin != slate::Origin::ScaLAPACK && !ref_copy) { - A_data.resize( A_alloc.lld * A_alloc.nloc ); + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; - copy(A, &A_data[0], A_desc); - } + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); - if (check || ref) { - // comparison with reference routine from ScaLAPACK + copy(A, &A_data[0], A_desc); + } // allocate work space int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), @@ -122,6 +120,8 @@ void test_henorm_work(Params& params, bool run) int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklanhe( lwork ); + // comparison with reference routine from ScaLAPACK + //================================================== // Run ScaLAPACK reference routine. //================================================== @@ -158,145 +158,131 @@ void test_henorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - if (grid_order != slate::GridOrder::Col) { - printf("WARNING: cannot do extended tests with row-major grid\n"); - } - else { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; - std::vector worklanhe(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. 
- int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*A_alloc.lld ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? 
ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_planhe( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + A_norm_ref = scalapack_planhe( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (A.mpiRank() == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*A_alloc.lld ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && A.mpiRank() == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_synorm.cc b/test/test_synorm.cc index fa8d3b250..eb3dfc025 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -95,30 +95,28 @@ void test_synorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // comparison with reference routine from ScaLAPACK - - // initialize BLACS and ScaLAPACK - blas_int ictxt, A_desc[9]; - A_alloc.create_ScaLAPACK_context( &ictxt ); - A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK + // comparison with reference routine from ScaLAPACK - auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); - if (origin != slate::Origin::ScaLAPACK && !ref_copy) { - A_data.resize( A_alloc.lld * A_alloc.nloc ); + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; - copy(A, &A_data[0], A_desc); - } + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); - if (check || ref) { - // comparison with reference routine from ScaLAPACK + copy(A, &A_data[0], A_desc); + } // allocate work space int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; - std::vector worklansy(lwork); + std::vector worklansy( lwork ); //================================================== // Run ScaLAPACK reference routine. @@ -156,145 +154,131 @@ void test_synorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - if (grid_order != slate::GridOrder::Col) { - printf("WARNING: cannot do extended tests with row-major grid\n"); - } - else { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; - std::vector worklansy(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. 
- int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*A_alloc.lld ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? 
ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plansy( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklansy[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } + A_norm_ref = scalapack_plansy( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklansy[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (A.mpiRank() == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real( peak ), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real( peak ), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*A_alloc.lld ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && A.mpiRank() == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 2f650fdeb..bd8d87f67 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -100,28 +100,27 @@ void test_trnorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // comparison with reference routine from ScaLAPACK - - // initialize BLACS and ScaLAPACK - blas_int ictxt, A_desc[9]; - A_alloc.create_ScaLAPACK_context( &ictxt ); - A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK + // comparison with reference routine from ScaLAPACK - auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); - if (origin != slate::Origin::ScaLAPACK && !ref_copy) { - A_data.resize( A_alloc.lld * A_alloc.nloc ); + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; - copy(A, &A_data[0], A_desc); - } + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); - // TODO move the above into this if statement - if (check || ref) { - // comparison with reference routine from ScaLAPACK + copy(A, &A_data[0], A_desc); + } // allocate work space - std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); + std::vector worklantr( std::max(A_alloc.mloc, A_alloc.nloc) ); + + // comparison with reference routine from ScaLAPACK //================================================== // Run ScaLAPACK reference routine. @@ -162,152 +161,141 @@ void test_trnorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - if (grid_order != slate::GridOrder::Col) { - printf("WARNING: cannot do extended tests with row-major grid\n"); - } - else { - // allocate work space - std::vector worklantr(std::max(A_alloc.mloc, A_alloc.nloc)); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops. 
- int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : i_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*A_alloc.lld ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? 
ii < jj + : ii > jj))) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - real_t A_norm_ref = scalapack_plantr( - norm2str(norm), uplo2str(A.uplo()), diag2str(diag), - m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + A_norm_ref = scalapack_plantr( + norm2str(norm), uplo2str(A.uplo()), diag2str(diag), + m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (A.mpiRank() == 0) { - // if peak is nan, expect A_norm to be nan, - // except in Unit case with i == j and ii == jj, - // where peak shouldn't affect A_norm. - bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan, + // except in Unit case with i == j and ii == jj, + // where peak shouldn't affect A_norm. + bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*A_alloc.lld ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } } } } - } - - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if (A.mpiRank() == 0) - printf( "ScaLAPACK not available\n" ); - #endif + + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- From 778a8660811d0e68b11efd903e1caaa95b1284b4 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 27 Dec 2023 14:55:37 -0500 Subject: [PATCH 26/33] Only do parameter check when doing actual test run --- test/test_gesv.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_gesv.cc b/test/test_gesv.cc index e8d7b96e5..1e7b00541 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -148,14 +148,14 @@ void test_gesv_work(Params& params, bool run) depth = params.depth(); } + if (! run) + return; + // Check for common invalid combinations if (is_invalid_parameters( params )) { return; } - if (! run) - return; - if ((params.routine == "gesv_mixed" || params.routine == "gesv_mixed_gmres") && ! std::is_same::value) { params.msg() = "skipping: unsupported mixed precision; must be type=d or z"; From ce5359148450934772829af9c3eb053c5e675c8c Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Tue, 23 Jan 2024 14:24:49 -0500 Subject: [PATCH 27/33] style --- test/test_utils.hh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_utils.hh b/test/test_utils.hh index 6c25788d3..df336b077 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -9,7 +9,7 @@ #include "slate/slate.hh" #include "test.hh" -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Checks for common invalid parameter combinations /// /// @return true if the configuration should be skipped @@ -59,7 +59,7 @@ inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = fal return false; } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Applies the operator thunk to each element of A and B to update B. /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the @@ -150,7 +150,7 @@ void matrix_iterator( A.releaseRemoteWorkspace(); } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// subtract_matrices takes input matrices A and B, and performs B = B - A. /// The matrices must have the same size, but can have different tile sizes and /// distributions. 
@@ -164,7 +164,7 @@ void subtract_matrices( matrix_type& A, matrix_type& B )
     matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } );
 }
 
-///-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 /// copy_matrix copies A to B
 /// The matrices must have the same size, but can have different tile sizes and
 /// distributions. However, the elements of a tile of B must all belong to the

From 3bc5090a14644a34b87dac68af0c4737fe540023 Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Tue, 23 Jan 2024 14:25:13 -0500
Subject: [PATCH 28/33] test: add trans back to getrs_tntpiv, nopiv

---
 test/run_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/run_tests.py b/test/run_tests.py
index d0ce7aace..34330b4e9 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -405,8 +405,8 @@ def filter_csv( values, csv ):
                        + ' --matrix rand_dominant' ],
     [ 'getrs',        gen + dtype + la + n + trans + ge_matrix + nonuniform_nb + thresh ],
-    [ 'getrs_tntpiv', gen + dtype + la + n + ge_matrix ],
-    [ 'getrs_nopiv',  gen + dtype + la + n + ge_matrix + nonuniform_nb
+    [ 'getrs_tntpiv', gen + dtype + la + n + trans + ge_matrix ],
+    [ 'getrs_nopiv',  gen + dtype + la + n + trans + ge_matrix + nonuniform_nb
                        + ' --matrix rand_dominant' ],
     [ 'getri',        gen + dtype + la + n ],

From 8d65640c0b89de8f421090d787bd508b20a07f52 Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 24 Jan 2024 18:12:33 -0500
Subject: [PATCH 29/33] factor nb out of {sy,he}norm loops; make {sy,he}norms
 more consistent

---
 src/internal/internal_henorm.cc |  6 +++---
 src/internal/internal_synorm.cc | 14 ++++++++------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc
index 36a0fe925..9f6cf0907 100644
--- a/src/internal/internal_henorm.cc
+++ b/src/internal/internal_henorm.cc
@@ -199,10 +199,11 @@ void norm(
         jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
             // off-diagonal blocks
             int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j)
                     && ( ( lower && i > j) ||
@@ -219,11 +220,10 @@ void norm(
                         &tiles_sums[A.m()*j + ii ], 1,
                         &values[ii], 1);
                 }
-                ii += A.tileMb(i);
+                ii += mb;
             }
 
             // diagonal blocks
-            int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 9a0b900cc..0ae70972d 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -143,7 +143,7 @@ void norm(
     int64_t jj = 0;
     #pragma omp taskgroup
     for (int64_t j = 0; j < A.nt(); ++j) {
-        // diagonal tile
+        // diagonal tiles
         if (j < A.mt() && A.tileIsLocal(j, j)) {
             #pragma omp task slate_omp_default_none \
                 shared( A, tiles_sums ) \
@@ -190,7 +190,7 @@ void norm(
             }
             jj += A.tileNb(j);
         }
-        //end omp taskgroup
+        // end omp taskgroup
 
         // Sum tile results into local results.
         // Summing up local contributions only.
@@ -198,10 +198,11 @@ void norm(
         jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
             // off-diagonal blocks
             int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j)
                     && ( ( lower && i > j) ||
@@ -218,11 +219,10 @@ void norm(
                         &tiles_sums[A.m()*j + ii ], 1,
                         &values[ii], 1);
                 }
-                ii += A.tileMb(i);
+                ii += mb;
             }
 
             // diagonal blocks
-            int64_t nb = A.tileNb(j);
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
@@ -302,6 +302,7 @@ void norm(
             }
         }
     }
+    // end omp taskgroup
 }
 
@@ -326,7 +327,8 @@ void norm(
 /// @ingroup norm_internal
 ///
 template <typename scalar_t>
-void norm(internal::TargetType,
+void norm(
+    internal::TargetType,
     Norm in_norm, NormScope scope, SymmetricMatrix<scalar_t>& A,
     blas::real_type<scalar_t>* values,
     int priority, int queue_index)

From 60041846970eafdea490e342b46f5db9c8d53d6b Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 15:58:11 -0500
Subject: [PATCH 30/33] remove unused vals_dev_arrays

---
 src/internal/internal_henorm.cc | 4 +---
 src/internal/internal_synorm.cc | 9 +--------
 src/internal/internal_trnorm.cc | 4 +---
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc
index 9f6cf0907..992287b73 100644
--- a/src/internal/internal_henorm.cc
+++ b/src/internal/internal_henorm.cc
@@ -346,8 +346,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
 
@@ -375,7 +373,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, layout, lower, queue_index, in_norm, ldv) \
             priority(priority)
         {
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 0ae70972d..79a5ba291 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -350,8 +350,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
 
@@ -379,7 +377,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, lower, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {
@@ -496,11 +494,6 @@ void norm(
         }
         // end omp taskgroup
 
-        for (int device = 0; device < A.num_devices(); ++device) {
-            blas::Queue* queue = A.compute_queue(device, queue_index);
-            blas::device_free(vals_dev_arrays[device], *queue);
-        }
-
         // Reduction over devices to local result.
         if (in_norm == Norm::Max) {
             *values = lapack::lange(in_norm,
diff --git a/src/internal/internal_trnorm.cc b/src/internal/internal_trnorm.cc
index 72e203c44..83cf429ae 100644
--- a/src/internal/internal_trnorm.cc
+++ b/src/internal/internal_trnorm.cc
@@ -365,8 +365,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
 
@@ -399,7 +397,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, irange, jrange ) \
+            shared( vals_host_arrays, irange, jrange ) \
             firstprivate(device, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {

From 971dc35f325fdfd45ef1b236692694b80aa0d582 Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 16:00:41 -0500
Subject: [PATCH 31/33] use B_alloc.lld; wrap

---
 test/test_posv.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/test_posv.cc b/test/test_posv.cc
index 8b2d6ef69..398ac323b 100644
--- a/test/test_posv.cc
+++ b/test/test_posv.cc
@@ -385,9 +385,13 @@ void test_posv_work(Params& params, bool run)
         if (verbose > 2) {
             if (origin == slate::Origin::ScaLAPACK) {
-                slate::Debug::diffLapackMatrices(n, n, &A_data[0], A_alloc.lld, &Aref_data[0], A_alloc.lld, nb, nb);
+                slate::Debug::diffLapackMatrices(
+                    n, n, &A_data[0], A_alloc.lld,
+                    &Aref_data[0], A_alloc.lld, nb, nb);
                 if (params.routine != "potrf") {
-                    slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], A_alloc.lld, &Bref_data[0], A_alloc.lld, nb, nb);
+                    slate::Debug::diffLapackMatrices(
+                        n, nrhs, &B_data[0], B_alloc.lld,
+                        &Bref_data[0], B_alloc.lld, nb, nb);
                 }
             }
         }

From 564f45710a49ecd962f9e9b97a95aa4e5066470c Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 16:05:26 -0500
Subject: [PATCH 32/33] use params.matrixC

---
 test/test_herk.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_herk.cc b/test/test_herk.cc
index 2bbef4b7a..2c86f7500 100644
--- a/test/test_herk.cc
+++ b/test/test_herk.cc
@@ -45,7 +45,7 @@ void test_herk_work(Params& params, bool run)
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
     params.matrix.mark();
-    params.matrixB.mark();
+    params.matrixC.mark();
 
     mark_params_for_test_HermitianMatrix( params );
     mark_params_for_test_Matrix( params );
@@ -91,7 +91,7 @@ void test_herk_work(Params& params, bool run)
     auto& Cref = C_alloc.Aref;
 
     slate::generate_matrix( params.matrix, A );
-    slate::generate_matrix( params.matrixB, C );
+    slate::generate_matrix( params.matrixC, C );
 
     // If reference run is required, record norms to be used in the check/ref.
     real_t A_norm=0, C_orig_norm=0;
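Patches 31 and 32 are small correctness fixes to the test drivers. The lld change in patch 31 matters because the tests store each local ScaLAPACK block column-major, so element (ilocal, jlocal) lives at data[ ilocal + jlocal*lld ], as the extended norm check above also relies on; indexing B's buffer with A's leading dimension only works when the two allocations happen to agree. A minimal sketch of the indexing convention, using a hypothetical local_entry helper that is not part of these patches:

    #include <cstdint>
    #include <vector>

    // Column-major local storage: an mloc-by-nloc local block with leading
    // dimension lld >= mloc keeps element (ilocal, jlocal) at
    // data[ ilocal + jlocal*lld ], matching the A_data/B_data buffers above.
    template <typename scalar_t>
    scalar_t& local_entry( std::vector<scalar_t>& data,
                           int64_t ilocal, int64_t jlocal, int64_t lld )
    {
        return data[ ilocal + jlocal*lld ];
    }
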
From d1eca32c92f80df7b46546b6a2e47f8d98a3484f Mon Sep 17 00:00:00 2001
From: Mark Gates
Date: Wed, 31 Jan 2024 16:37:00 -0500
Subject: [PATCH 33/33] test: replace DevDist with GridOrder

---
 test/matrix_utils.cc |  9 ++++-----
 test/matrix_utils.hh |  2 +-
 test/run_tests.py    | 16 ++++++++--------
 test/test.cc         |  5 +++--
 test/test.hh         | 33 +--------------------------------
 test/test_utils.hh   | 10 +++++-----
 6 files changed, 22 insertions(+), 53 deletions(-)

diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc
index 0de0e57e9..c751bb302 100644
--- a/test/matrix_utils.cc
+++ b/test/matrix_utils.cc
@@ -21,11 +21,11 @@ static TestMatrix allocate_test_shared(
     // Load params variables
     int p = params.grid.m();
     int q = params.grid.n();
-    slate::Dist dev_dist = params.dev_dist();
     int64_t nb = params.nb();
     bool nonuniform_nb = params.nonuniform_nb() == 'y';
     slate::Origin origin = params.origin();
     slate::GridOrder grid_order = params.grid_order();
+    slate::GridOrder dev_order = params.dev_order();
 
     // The object to be returned
     TestMatrix matrix( m, n, nb, p, q, grid_order );
@@ -48,14 +48,13 @@ static TestMatrix allocate_test_shared(
     auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
     int num_devices_ = blas::get_device_count();
-    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder( dev_dist ),
-                                                   p, num_devices_ );
+    auto tileDevice = slate::func::device_1d_grid( dev_order, p, num_devices_ );
 
     // Setup matrix to test SLATE with
     if (origin != slate::Origin::ScaLAPACK) {
         // SLATE allocates CPU or GPU tiles.
         slate::Target origin_target = origin2target( origin );
-        if (nonuniform_nb || dev_dist == slate::Dist::Col) {
+        if (nonuniform_nb || dev_order == slate::GridOrder::Col) {
             matrix.A = construct_irregular( tileNb, tileRank, tileDevice );
         }
         else {
@@ -65,7 +64,7 @@ static TestMatrix allocate_test_shared(
     }
     else {
         assert( !nonuniform_nb );
-        assert( dev_dist == slate::Dist::Row );
+        assert( dev_order == slate::GridOrder::Row );
         // Create SLATE matrix from the ScaLAPACK layouts
         matrix.A_data.resize( matrix.lld * matrix.nloc );
         matrix.A = construct_scalapack( &matrix.A_data[0], matrix.lld, nb,
diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 247a8ef12..b742b7ccf 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -265,11 +265,11 @@ inline void mark_params_for_test_Matrix(Params& params)
 {
     params.grid.m();
     params.grid.n();
-    params.dev_dist();
     params.nb();
     params.nonuniform_nb();
     params.origin();
     params.grid_order();
+    params.dev_order();
 }
 
 //------------------------------------------------------------------------------
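The device_1d_grid call above no longer needs the slate::GridOrder( dev_dist ) cast. That cast was only well defined because Dist carried the same underlying char values; the Dist enum deleted from test.hh later in this patch shows Row = 'R' and Col = 'C'. A sketch of the equivalence the old shim relied on, where the GridOrder values are an assumption for illustration, not quoted from slate:

    // Hypothetical stand-ins to illustrate the value-preserving cast;
    // slate's real GridOrder may define its values differently.
    enum class Dist : char { Row = 'R', Col = 'C' };
    enum class GridOrder : char { Col = 'C', Row = 'R' };

    static_assert( char(Dist::Row) == char(GridOrder::Row)
                   && char(Dist::Col) == char(GridOrder::Col),
                   "GridOrder(dev_dist) is value-preserving only if these match" );
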
diff --git a/test/run_tests.py b/test/run_tests.py
index d0ce7aace..574ecfc10 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -121,13 +121,13 @@
 group_opt.add_argument( '--origin', action='store', help='default=%(default)s', default='s' )
 group_opt.add_argument( '--target', action='store', help='default=%(default)s', default='t' )
 group_opt.add_argument( '--lookahead', action='store', help='default=%(default)s', default='1' )
-group_opt.add_argument( '--dev-dist', action='store', help='default=%(default)s', default='c,r' )
 group_opt.add_argument( '--nb', action='store', help='default=%(default)s', default='64,100' )
 group_opt.add_argument( '--nonuniform-nb', action='store', help='default=%(default)s', default='n' )
 group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' )
 group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' )
 group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' )
-group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='r' )
+group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='' )
+group_opt.add_argument( '--dev-order', action='store', help='default=%(default)s', default='r,c' )
 group_opt.add_argument( '--repeat', action='store', help='times to repeat each test', default='' )
 group_opt.add_argument( '--thresh', action='store', help='default=%(default)s', default='1,0.5' )
 group_opt.add_argument( '--matrix', action='store', help='default=%(default)s', default='' )
@@ -298,12 +298,12 @@
 origin        = ' --origin '        + opts.origin        if (opts.origin)        else ''
 target        = ' --target '        + opts.target        if (opts.target)        else ''
 la            = ' --lookahead '     + opts.lookahead     if (opts.lookahead)     else ''
-ddist         = ' --dev-dist '      + opts.dev_dist      if (opts.dev_dist)      else ''
 nb            = ' --nb '            + opts.nb            if (opts.nb)            else ''
 nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else ''
 nt            = ' --nt '            + opts.nt            if (opts.nt)            else ''
 grid          = ' --grid '          + opts.grid          if (opts.grid)          else ''
-grid_order    = ' --grid-order '    + opts.grid_order    if (opts.grid_order)    else ''
+grid_order    = ' --grid-order '    + opts.grid_order    if (opts.grid_order)    else ''
+dev_order     = ' --dev-order '     + opts.dev_order     if (opts.dev_order)     else ''
 repeat        = ' --repeat '        + opts.repeat        if (opts.repeat)        else ''
 thresh        = ' --thresh '        + opts.thresh        if (opts.thresh)        else ''
 matrix        = ' --matrix '        + opts.matrix        if (opts.matrix)        else ''
@@ -316,10 +316,10 @@
 gen_no_nb = origin + target + grid + check + ref + tol + repeat
 gen_no_target = grid + check + ref + tol + repeat + nb
 
-ge_matrix = ddist + grid_order
-sy_matrix = uplo + ddist + grid_order
-he_matrix = uplo + ddist + grid_order
-tr_matrix = uplo + diag + ddist + grid_order
+ge_matrix = grid_order + dev_order
+sy_matrix = grid_order + dev_order + uplo
+he_matrix = grid_order + dev_order + uplo
+tr_matrix = grid_order + dev_order + uplo + diag
 
 if (opts.matrix):
     gen += matrix
diff --git a/test/test.cc b/test/test.cc
index 16ba41ecf..3ba473dc2 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -356,8 +356,8 @@ Params::Params():
     method_lu   ("lu",   5, ParamType::List, slate::MethodLU::PartialPiv, str2methodLU, methodLU2str, "PartialPiv, CALU, NoPiv"),
     method_trsm ("trsm", 4, ParamType::List, 0, str2methodTrsm, methodTrsm2str, "auto=auto, A=trsmA, B=trsmB"),
 
-    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
-    dev_dist  ("dev-dist", 9, ParamType::List, slate::Dist::Row, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"),
+    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
+    dev_order ("do", 3, ParamType::List, slate::GridOrder::Row, str2grid_order, grid_order2str, "(do) Device grid order: c=Col, r=Row"),
 
     // name, w, type, default, char2enum, enum2char, enum2str, help
     layout ("layout", 6, ParamType::List, slate::Layout::ColMajor, blas::char2layout, blas::layout2char, blas::layout2str, "layout: r=row major, c=column major"),
@@ -463,6 +463,7 @@ Params::Params():
     lookahead.name("la", "lookahead");
     panel_threads.name("pt", "panel-threads");
     grid_order.name("go", "grid-order");
+    dev_order.name("do", "dev-order");
 
     // Change name for the methods to use less space in the stdout
     method_cholQR.name("cholQR", "method-cholQR");
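Registering dev_order with str2grid_order and grid_order2str means one converter pair now serves both grid options, which is what allows the Dist-specific str2dist and dist2str helpers to be deleted from test.hh below. A condensed sketch of such a converter pair, modeled on the deleted Dist versions rather than on the actual test.cc implementation, with assumed enum values:

    #include <cctype>
    #include <stdexcept>

    enum class GridOrder : char { Col = 'C', Row = 'R' };  // assumed values

    // Parse "c"/"col"/"r"/"row" style spellings by their first letter.
    inline GridOrder str2grid_order_sketch( const char* str )
    {
        char c = (char) std::tolower( (unsigned char) str[0] );
        if (c == 'c')
            return GridOrder::Col;
        else if (c == 'r')
            return GridOrder::Row;
        else
            throw std::runtime_error( "unknown grid order" );
    }

    inline const char* grid_order2str_sketch( GridOrder order )
    {
        switch (order) {
            case GridOrder::Col: return "col";
            case GridOrder::Row: return "row";
        }
        return "?";
    }
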
diff --git a/test/test.hh b/test/test.hh
index 57c428443..8dec12510 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -27,11 +27,6 @@ enum class Origin : char {
     Devices = 'D',
 };
 
-enum class Dist : char {
-    Row = 'R',
-    Col = 'C',
-};
-
 } // namespace slate
 
 // -----------------------------------------------------------------------------
@@ -88,7 +83,7 @@ public:
     testsweeper::ParamEnum< slate::Method > method_trsm;
 
     testsweeper::ParamEnum< slate::GridOrder > grid_order;
-    testsweeper::ParamEnum< slate::Dist > dev_dist;
+    testsweeper::ParamEnum< slate::GridOrder > dev_order;
 
     // ----- test matrix parameters
     MatrixParams matrix;
@@ -284,32 +279,6 @@ void test_scale (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set (Params& params, bool run);
 
-// -----------------------------------------------------------------------------
-inline slate::Dist str2dist(const char* dist)
-{
-    std::string distribution_ = dist;
-    std::transform(
-        distribution_.begin(),
-        distribution_.end(),
-        distribution_.begin(), ::tolower);
-    if (distribution_ == "row" || distribution_ == "r")
-        return slate::Dist::Row;
-    else if (distribution_ == "col" || distribution_ == "c"
-             || distribution_ == "column")
-        return slate::Dist::Col;
-    else
-        throw slate::Exception("unknown distribution");
-}
-
-inline const char* dist2str(slate::Dist dist)
-{
-    switch (dist) {
-        case slate::Dist::Row: return "row";
-        case slate::Dist::Col: return "col";
-    }
-    return "?";
-}
-
 // -----------------------------------------------------------------------------
 inline slate::Origin str2origin(const char* origin)
 {
diff --git a/test/test_utils.hh b/test/test_utils.hh
index df336b077..bdae1d494 100644
--- a/test/test_utils.hh
+++ b/test/test_utils.hh
@@ -18,16 +18,16 @@ inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = fal
 {
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
-    slate::Dist dev_dist = params.dev_dist();
+    slate::GridOrder dev_order = params.dev_order();
     bool nonuniform_nb = params.nonuniform_nb() == 'y';
 
-    if (target != slate::Target::Devices && dev_dist == slate::Dist::Col) {
-        params.msg() = "skipping: dev_dist = Col applies only to target devices";
+    if (target != slate::Target::Devices && dev_order == slate::GridOrder::Col) {
+        params.msg() = "skipping: dev_order = Col applies only to target devices";
         return true;
     }
 
-    if (dev_dist == slate::Dist::Col && origin == slate::Origin::ScaLAPACK) {
-        params.msg() = "skipping: dev_dist = Col tile not supported with origin=ScaLAPACK";
+    if (dev_order == slate::GridOrder::Col && origin == slate::Origin::ScaLAPACK) {
+        params.msg() = "skipping: dev_order = Col not supported with origin=ScaLAPACK";
         return true;
     }
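Together with patch 26, this leaves test_gesv (and the other drivers that share these helpers) with a uniform guard sequence: return from the dry-run pass first, then consult is_invalid_parameters so that skip messages are only produced for configurations that would actually execute. A condensed sketch of the resulting pattern, using the names from the test_gesv.cc diff above:

    struct Params;  // the test harness parameter set declared in test.hh
    bool is_invalid_parameters( Params& params );  // from test_utils.hh

    void test_work( Params& params, bool run )
    {
        // ... read and mark params so they appear in the output header ...

        // First pass only registers parameters; nothing to validate yet.
        if (! run)
            return;

        // Check for common invalid combinations (may set a skip message).
        if (is_invalid_parameters( params )) {
            return;
        }

        // ... the actual test body runs only for valid configurations ...
    }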