From d31ad7b1801d01da498754ac28bffe1cd30b863b Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Fri, 10 Nov 2023 15:27:45 -0500 Subject: [PATCH 01/19] Add full support for nonuniform tile sizes to gesv tester --- test/matrix_utils.hh | 132 ++++++++++++++++++++++++++++++++++++++++ test/test.cc | 2 +- test/test_gesv.cc | 142 ++++++++++--------------------------------- 3 files changed, 165 insertions(+), 111 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index e9fd18412..1ff2df78d 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -203,4 +203,136 @@ matrix_type matrix_cast( return matrix_cast_impl( MatrixType< matrix_type >(), A, uplo, diag ); } + +// ----------------------------------------------------------------------------- +// Functions for allocating test matrices + +template +class TestMatrix { + using scalar_t = typename MatrixType::value_type; + +public: + // SLATE matrices + MatrixType A; + MatrixType Aref; + + // Storage for ScaLAPACK matrices + std::vector A_data; + std::vector Aref_data; + + // ScaLAPACK configuration + int64_t mloc, nloc, lld, nb; +}; + +// ----------------------------------------------------------------------------- +/// Marks the paramters used by allocate_test_Matrix +inline void mark_params_for_test_Matrix(Params& params) +{ + params.grid.m(); + params.grid.n(); + params.nb(); + params.nonuniform_nb(); + params.origin(); + params.grid_order(); +} + +// ----------------------------------------------------------------------------- +/// Allocates a Matrix and a reference version for testing. +/// +/// @param run[in] +/// Whether to actaully allocate the matrix, instead of just marking params +/// +/// @param nonuniform_ref[in] +/// If params.nonuniform_nb(), whether to also allocate the reference matrix +/// with non-uniform tiles. +/// +/// @param m[in] +/// The number of rows +/// +/// @param n[in] +/// The number of columns +/// +/// @param params[in] +/// The test params object which contains many of the key parameters +/// +template +TestMatrix> allocate_test_Matrix( + bool ref_matrix, + int64_t m, + int64_t n, + Params& params) +{ + // Load params variables + int64_t p = params.grid.m(); + int64_t q = params.grid.n(); + int64_t nb = params.nb(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + slate::Origin origin = params.origin(); + slate::GridOrder grid_order = params.grid_order(); + + // The object to be returned + TestMatrix> matrix; + + // ScaLAPACK variables + int mpi_rank, myrow, mycol; + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); + matrix.nb = nb; + matrix.mloc = num_local_rows_cols( m, nb, myrow, p ); + matrix.nloc = num_local_rows_cols (n, nb, mycol, q ); + matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A + + // Functions for nonuniform tile sizes + std::function< int64_t (int64_t j) > + tileNb = [nb](int64_t j) { + // for non-uniform tile size + return (j % 2 != 0 ? nb*2 : nb); + }; + auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); + int num_devices_ = blas::get_device_count(); + auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, + p, num_devices_ ); + + // Setup matrix to test SLATE with + if (origin != slate::Origin::ScaLAPACK) { + // SLATE allocates CPU or GPU tiles. 
+ slate::Target origin_target = origin2target( origin ); + if (nonuniform_nb) { + + matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD); + } + else { + matrix.A = slate::Matrix( m, n, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + matrix.A.insertLocalTiles( origin_target ); + } + else { + assert( !nonuniform_nb ); + // Create SLATE matrix from the ScaLAPACK layouts + matrix.A_data.resize( matrix.lld * matrix.nloc ); + matrix.A = slate::Matrix::fromScaLAPACK( + m, n, &matrix.A_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + + // Setup reference matrix + if (ref_matrix) { + if (nonuniform_nb) { + matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, + tileDevice, MPI_COMM_WORLD ); + matrix.Aref.insertLocalTiles( slate::Target::Host ); + } + else { + matrix.Aref_data.resize( matrix.lld * matrix.nloc ); + matrix.Aref = slate::Matrix::fromScaLAPACK( + m, n, &matrix.Aref_data[0], matrix.lld, nb, nb, + grid_order, p, q, MPI_COMM_WORLD ); + } + } + + return matrix; +} + #endif // SLATE_MATRIX_UTILS_HH diff --git a/test/test.cc b/test/test.cc index 2d46f782f..ebf397f78 100644 --- a/test/test.cc +++ b/test/test.cc @@ -404,7 +404,7 @@ Params::Params(): 0, 1000000, "(pt) max number of threads used in panel; default omp_num_threads / 2"), align ("align", 5, ParamType::List, 32, 1, 1024, "column alignment (sets lda, ldb, etc. to multiple of align)"), nonuniform_nb("nonuniform_nb", - 0, ParamType::Value, 'n', "ny", "generate matrix with nonuniform tile sizes"), + 0, ParamType::List, 'n', "ny", "generate matrix with nonuniform tile sizes"), debug ("debug", 0, ParamType::Value, -1, 0, 1000000, "given rank waits for debugger (gdb/lldb) to attach"), pivot_threshold( diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 5a21e955d..d25240d01 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -9,6 +9,7 @@ #include "lapack/flops.hh" #include "print_matrix.hh" #include "grid_utils.hh" +#include "matrix_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -60,7 +61,6 @@ void test_gesv_work(Params& params, bool run) int64_t nrhs = params.nrhs(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); - int64_t nb = params.nb(); int64_t ib = params.ib(); int64_t lookahead = params.lookahead(); int64_t panel_threads = params.panel_threads(); @@ -78,6 +78,8 @@ void test_gesv_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_Matrix( params ); + // Currently only gesv* supports timer_level >= 2. std::vector timer_lvl_support{ "gesv", "gesv_mixed", "gesv_mixed_gmres"}; @@ -154,20 +156,9 @@ void test_gesv_work(Params& params, bool run) if (! run) return; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - - if (nonuniform_nb) { - if (ref || origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; - return; - } - params.ref() = 'n'; - params.origin() = slate::Origin::Host; - ref = false; - origin = slate::Origin::Host; + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + return; } if ((params.routine == "gesv_mixed" || params.routine == "gesv_mixed_gmres") @@ -192,78 +183,23 @@ void test_gesv_work(Params& params, bool run) int64_t info = 0; - // Matrix A: figure out local size. 
- int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(nrhs, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // To generate matrix with non-uniform tile size using the lambda constructor - std::function< int64_t (int64_t j) > - tileNb = [nb](int64_t j) { - // for non-uniform tile size - return (j % 2 != 0 ? nb/2 : nb); - }; - - // rank and device functions for using the lambda constructor - auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col, p, q ); - int num_devices_ = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, - p, num_devices_ ); - - std::vector A_data, B_data, X_data; - slate::Matrix A, B, X; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - if (nonuniform_nb) { - A = slate::Matrix(m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - B = slate::Matrix(n, nrhs, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - } - else { - A = slate::Matrix( - m, n, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - B = slate::Matrix( - n, nrhs, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } - A.insertLocalTiles(origin_target); - B.insertLocalTiles(origin_target); - - if (is_iterative) { - X_data.resize(lldB*nlocB); - if (nonuniform_nb) { - X = slate::Matrix(n, nrhs, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - } - else { - X = slate::Matrix( - n, nrhs, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } - X.insertLocalTiles(origin_target); - } + auto A_allocation = allocate_test_Matrix( check || ref, + m, n, params ); + auto B_allocation = allocate_test_Matrix( check || ref, + n, nrhs, params ); + TestMatrix> X_allocation; + if (is_iterative) { + X_allocation = allocate_test_Matrix( false, + n, nrhs, params ); } - else { - // Create SLATE matrix from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - - B_data.resize( lldB * nlocB ); - B = slate::Matrix::fromScaLAPACK( - n, nrhs, &B_data[0], lldB, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - if (is_iterative) { - X_data.resize(lldB*nlocB); - X = slate::Matrix::fromScaLAPACK( - n, nrhs, &X_data[0], lldB, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } - } + auto& A = A_allocation.A; + auto& Aref = A_allocation.Aref; + auto& Aref_data = A_allocation.Aref_data; + auto& B = B_allocation.A; + auto& Bref = B_allocation.Aref; + auto& Bref_data = B_allocation.Aref_data; + auto& X = X_allocation.A; slate::Pivots pivots; @@ -272,28 +208,7 @@ void test_gesv_work(Params& params, bool run) slate::generate_matrix(params.matrixB, B, matgen_opts); // If check/ref is required, copy test data. 
- slate::Matrix Aref, Bref; - std::vector Aref_data, Bref_data; if (check || ref) { - if (nonuniform_nb) { - Aref = slate::Matrix(m, n, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - Bref = slate::Matrix(n, nrhs, tileNb, tileNb, tileRank, - tileDevice, MPI_COMM_WORLD); - Aref.insertLocalTiles( slate::Target::Host ); - Bref.insertLocalTiles( slate::Target::Host ); - } - else { - Aref_data.resize( lldA* nlocA ); - Bref_data.resize( lldB* nlocB ); - Aref = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - Bref = slate::Matrix::fromScaLAPACK( - n, nrhs, &Bref_data[0], lldB, nb, nb, - grid_order, p, q, MPI_COMM_WORLD ); - } - slate::copy(A, Aref); slate::copy(B, Bref); } @@ -474,10 +389,15 @@ void test_gesv_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only if (nonuniform_nb) { - printf("Unsupported to test nonuniform tile size using scalapack\n"); + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; return; } + // MPI variables + int mpi_rank, myrow, mycol; + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); + // BLACS/MPI variables blas_int ictxt, myrow_, mycol_, p_, q_; blas_int mpi_rank_ = 0, nprocs = 1; @@ -492,17 +412,19 @@ void test_gesv_work(Params& params, bool run) slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); + int nb = A_allocation.nb; + // ScaLAPACK descriptor for the reference matrix blas_int Aref_desc[9]; - scalapack_descinit(Aref_desc, m, n, nb, nb, 0, 0, ictxt, mlocA, &info); + scalapack_descinit(Aref_desc, m, n, nb, nb, 0, 0, ictxt, A_allocation.mloc, &info); slate_assert(info == 0); blas_int Bref_desc[9]; - scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info); + scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, B_allocation.mloc, &info); slate_assert(info == 0); // ScaLAPACK data for pivots. - std::vector ipiv_ref(lldA + nb); + std::vector ipiv_ref(A_allocation.lld + nb); if (params.routine == "getrs") { // Factor matrix A. 
From 9d641bda562479a605cbc316471f2942e83945ee Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 13 Nov 2023 11:30:57 -0500 Subject: [PATCH 02/19] Improve robustness of BaseMatrix --- include/slate/BaseMatrix.hh | 34 ++++++++++++++++++++++--- include/slate/internal/MatrixStorage.hh | 9 ++++--- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index f4c666184..d263a3bd7 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -904,8 +904,32 @@ BaseMatrix::BaseMatrix( ++nt_; } + // Pass variables into lambda w/out full this object + int64_t mt_1 = mt_-1; + int64_t last_mb = last_mb_; + int64_t nt_1 = nt_-1; + int64_t last_nb = last_nb_; + std::function + realTileMb = [inTileMb, last_mb, mt_1](int64_t i) { + if (i == mt_1) { + return last_mb; + } + else { + return inTileMb( i ); + } + }; + std::function + realTileNb = [inTileNb, last_nb, nt_1](int64_t j) { + if (j == nt_1) { + return last_nb; + } + else { + return inTileNb( j ); + } + }; + storage_ = std::make_shared< MatrixStorage< scalar_t > >( - mt_, nt_, inTileMb, inTileNb, + mt_, nt_, realTileMb, realTileNb, inTileRank, inTileDevice, mpi_comm ); slate_mpi_call( @@ -2686,6 +2710,11 @@ void BaseMatrix::tileGet(int64_t i, int64_t j, int dst_device, src_tile->layout() : Layout(layout); } + else { + target_layout = layout == LayoutConvert::None ? + tile_node[dst_device]->layout() : + Layout(layout); + } if (! tile_node.existsOn(dst_device)) { // Create a copy on the destination. @@ -2700,8 +2729,7 @@ void BaseMatrix::tileGet(int64_t i, int64_t j, int dst_device, tileCopyDataLayout( src_tile, dst_tile, target_layout, async ); dst_tile->state(MOSI::Shared); - if (src_tile->stateOn(MOSI::Modified)) - src_tile->state(MOSI::Shared); + src_tile->state(MOSI::Shared); // src was either shared or modified } if (modify) { tileModified(i, j, dst_device); diff --git a/include/slate/internal/MatrixStorage.hh b/include/slate/internal/MatrixStorage.hh index 3257f8004..b2d8fb98c 100644 --- a/include/slate/internal/MatrixStorage.hh +++ b/include/slate/internal/MatrixStorage.hh @@ -107,10 +107,11 @@ public: void eraseOn(int device) { slate_assert(device >= -1 && device+1 < int(tiles_.size())); - if (tiles_[device+1] != nullptr) { - tiles_[device+1]->state(MOSI::Invalid); - delete tiles_[device+1]; - tiles_[device+1] = nullptr; + auto tile = tiles_[device+1]; + tiles_[device+1] = nullptr; + if (tile != nullptr) { + assert(!tile->stateOn(MOSI::OnHold)); + delete tile; --num_instances_; } } From 63bd0ed37d923df0375b20df6c8ceddf12019008 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Mon, 13 Nov 2023 13:28:24 -0500 Subject: [PATCH 03/19] Add nonuniform block size to GEMM tester --- test/matrix_utils.hh | 14 +++- test/test_gemm.cc | 160 +++++++++++++++++-------------------------- test/test_gesv.cc | 38 +++++----- 3 files changed, 93 insertions(+), 119 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index 1ff2df78d..aab4d9469 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -8,6 +8,8 @@ #include "slate/slate.hh" +#include "scalapack_wrappers.hh" + //------------------------------------------------------------------------------ // Zero out B, then copy band matrix B from A. 
// B is stored as a non-symmetric matrix, so we can apply Q from left @@ -221,7 +223,15 @@ public: std::vector Aref_data; // ScaLAPACK configuration - int64_t mloc, nloc, lld, nb; + int64_t m, n, mloc, nloc, lld, nb; + + #ifdef SLATE_HAVE_SCALAPACK + void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) { + int64_t info; + scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mloc, &info); + slate_assert(info == 0); + } + #endif }; // ----------------------------------------------------------------------------- @@ -272,6 +282,8 @@ TestMatrix> allocate_test_Matrix( // The object to be returned TestMatrix> matrix; + matrix.m = m; + matrix.n = n; // ScaLAPACK variables int mpi_rank, myrow, mycol; diff --git a/test/test_gemm.cc b/test/test_gemm.cc index 8af202539..f9f1c18fe 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -8,6 +8,7 @@ #include "blas/flops.hh" #include "print_matrix.hh" #include "grid_utils.hh" +#include "matrix_utils.hh" #include "scalapack_wrappers.hh" #include "scalapack_support_routines.hh" @@ -43,7 +44,6 @@ void test_gemm_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); int64_t k = params.dim.k(); - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); int64_t lookahead = params.lookahead(); @@ -52,15 +52,18 @@ void test_gemm_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); - slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::Origin origin = params.origin(); slate::GridOrder grid_order = params.grid_order(); slate::Method method_gemm = params.method_gemm(); params.matrix.mark(); params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -74,6 +77,15 @@ void test_gemm_work(Params& params, bool run) if (! run) return; + #ifndef SLATE_HAVE_SCALAPACK + // Can only run ref when we have ScaLAPACK + if (ref) { + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + ref = false; + } + #endif + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -91,77 +103,23 @@ void test_gemm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(n, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - // todo: C_data only if origin == ScaLAPACK? 
- C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, Cm, Cn, params ); - slate::Matrix A, B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix( - Am, An, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - A.insertLocalTiles( origin_target ); - - B = slate::Matrix( - Bm, Bn, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - B.insertLocalTiles( origin_target ); - - C = slate::Matrix( - Cm, Cn, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - C.insertLocalTiles( origin_target ); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, nb, grid_order, p, q, MPI_COMM_WORLD ); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix(params.matrix, A); slate::generate_matrix(params.matrixB, B); slate::generate_matrix(params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - std::vector Cref_data; - slate::Matrix Cref; - if (ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - m, n, &Cref_data[0], lldC, nb, nb, grid_order, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + // if reference run is required, copy test data. + if (ref) { + slate::copy( C, Cref ); + } if (transA == slate::Op::Trans) A = transpose(A); @@ -177,29 +135,29 @@ void test_gemm_work(Params& params, bool run) slate_assert(B.nt() == C.nt()); slate_assert(A.nt() == B.mt()); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, record norms to be used in the check/ref. - real_t A_norm=0, B_norm=0, C_orig_norm=0; - if (ref) { - A_norm = slate::norm(norm, A); - B_norm = slate::norm(norm, B); - C_orig_norm = slate::norm(norm, Cref); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { // Compute Y = alpha A * (B * X) + (beta C * X). - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( k, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, k, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Z = B * X; slate::multiply( one, B, X, zero, Z, opts ); @@ -246,6 +204,9 @@ void test_gemm_work(Params& params, bool run) } if (check && ! 
ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -263,36 +224,46 @@ void test_gemm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + if (nonuniform_nb) { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + return; + } // BLACS/MPI variables + int mpi_rank, myrow, mycol; blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); Cblacs_pinfo(&mpi_rank_, &nprocs); slate_assert( mpi_rank == mpi_rank_ ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. 
copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); @@ -338,9 +309,6 @@ void test_gemm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); #endif } } diff --git a/test/test_gesv.cc b/test/test_gesv.cc index d25240d01..19408010f 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -183,23 +183,20 @@ void test_gesv_work(Params& params, bool run) int64_t info = 0; - auto A_allocation = allocate_test_Matrix( check || ref, - m, n, params ); - auto B_allocation = allocate_test_Matrix( check || ref, - n, nrhs, params ); - TestMatrix> X_allocation; + auto A_alloc = allocate_test_Matrix( check || ref, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, n, nrhs, params ); + TestMatrix> X_alloc; if (is_iterative) { - X_allocation = allocate_test_Matrix( false, - n, nrhs, params ); + X_alloc = allocate_test_Matrix( false, n, nrhs, params ); } - auto& A = A_allocation.A; - auto& Aref = A_allocation.Aref; - auto& Aref_data = A_allocation.Aref_data; - auto& B = B_allocation.A; - auto& Bref = B_allocation.Aref; - auto& Bref_data = B_allocation.Aref_data; - auto& X = X_allocation.A; + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; + auto& Bref_data = B_alloc.Aref_data; + auto& X = X_alloc.A; slate::Pivots pivots; @@ -412,19 +409,16 @@ void test_gesv_work(Params& params, bool run) slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int nb = A_allocation.nb; + int nb = A_alloc.nb; // ScaLAPACK descriptor for the reference matrix - blas_int Aref_desc[9]; - scalapack_descinit(Aref_desc, m, n, nb, nb, 0, 0, ictxt, A_allocation.mloc, &info); - slate_assert(info == 0); + blas_int Aref_desc[9], Bref_desc[9]; - blas_int Bref_desc[9]; - scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, B_allocation.mloc, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); // ScaLAPACK data for pivots. - std::vector ipiv_ref(A_allocation.lld + nb); + std::vector ipiv_ref(A_alloc.lld + nb); if (params.routine == "getrs") { // Factor matrix A. From 87e1558252da993d25695817ad3bca7487ec75e5 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 08:50:59 -0500 Subject: [PATCH 04/19] Add nonuniform tests to set --- test/matrix_utils.hh | 8 +-- test/test_add.cc | 132 ------------------------------------------- test/test_gemm.cc | 12 ++-- test/test_gesv.cc | 6 +- test/test_set.cc | 71 +++++++---------------- test/test_utils.hh | 111 ++++++++++++++++++++++++++++++++++++ 6 files changed, 145 insertions(+), 195 deletions(-) create mode 100644 test/test_utils.hh diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index aab4d9469..fbc3c2a22 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -249,8 +249,8 @@ inline void mark_params_for_test_Matrix(Params& params) // ----------------------------------------------------------------------------- /// Allocates a Matrix and a reference version for testing. 
/// -/// @param run[in] -/// Whether to actaully allocate the matrix, instead of just marking params +/// @param ref_matrix[in] +/// Whether to allocate a reference matrix /// /// @param nonuniform_ref[in] /// If params.nonuniform_nb(), whether to also allocate the reference matrix @@ -268,6 +268,7 @@ inline void mark_params_for_test_Matrix(Params& params) template TestMatrix> allocate_test_Matrix( bool ref_matrix, + bool nonuniform_ref, int64_t m, int64_t n, Params& params) @@ -310,7 +311,6 @@ TestMatrix> allocate_test_Matrix( // SLATE allocates CPU or GPU tiles. slate::Target origin_target = origin2target( origin ); if (nonuniform_nb) { - matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); } @@ -331,7 +331,7 @@ TestMatrix> allocate_test_Matrix( // Setup reference matrix if (ref_matrix) { - if (nonuniform_nb) { + if (nonuniform_nb && nonuniform_ref) { matrix.Aref = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD ); matrix.Aref.insertLocalTiles( slate::Target::Host ); diff --git a/test/test_add.cc b/test/test_add.cc index 90a8ab6a1..9f8703500 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -18,138 +18,6 @@ #include #include -// ----------------------------------------------------------------------------- -// subtract_matrix takes input matrices A and B, -// assumed to be of the same dimension, and performs the operation B = A - B. -// This was developed for checking slate::add without using slate::add to check. -// It is a CPU-only implementation and assumes column-major. - -template -void subtract_matrix( matrix_type& A, matrix_type& B ) -{ - using scalar_t = typename matrix_type::value_type; - int64_t mt = A.mt(); - int64_t nt = A.nt(); - if constexpr (std::is_same>::value) { - #pragma omp parallel for collapse(2) - for (int64_t j = 0; j < nt; ++j) { - for (int64_t i = 0; i < mt; ++i) { - if (A.tileIsLocal( i, j )) { - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - } - } - } - else if (A.uploPhysical() == slate::Uplo::Upper) { - // todo: collapse(2) here requires OpenMP 5 - #pragma omp parallel for - for (int64_t i = 0; i < mt; ++i) { - for (int64_t j = i; j < nt; ++j) { - if (A.tileIsLocal( i, j )) { - auto Aij = A(i, j); - if (Aij.uploPhysical() == slate::Uplo::Upper) { - // Diagonal tiles are upper. - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii <= jj && ii < mb; ++ii) { // upper - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - else { - // Off-diagonal tiles are full. 
- A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - } - } - } - } - else if (A.uploPhysical() == slate::Uplo::Lower) { - // todo: collapse(2) here requires OpenMP 5 - #pragma omp parallel for - for (int64_t j = 0; j < nt; ++j) { - for (int64_t i = j; i < mt; ++i) { - if (A.tileIsLocal( i, j )) { - auto Aij = A(i, j); - if (Aij.uploPhysical() == slate::Uplo::Lower) { - // Diagonal tiles are lower. - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = jj; ii < mb; ++ii) { // lower - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - else { - // Off-diagonal tiles are full. - A.tileGetForReading( i, j, slate::LayoutConvert::None ); - B.tileGetForWriting( i, j, slate::LayoutConvert::None ); - auto TA = A( i, j ); - auto TB = B( i, j ); - int64_t mb = TA.mb(); - int64_t nb = TA.nb(); - int64_t lda = TA.stride(); - int64_t ldb = TB.stride(); - scalar_t const* TA_data = TA.data(); - scalar_t* TB_data = TB.data(); - for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { - TB_data[ ii + jj*ldb ] -= TA_data[ ii + jj*lda ]; - } - } - } - } - } - } - } -} - //------------------------------------------------------------------------------ template void test_add_work(Params& params, bool run) diff --git a/test/test_gemm.cc b/test/test_gemm.cc index f9f1c18fe..659186610 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -103,9 +103,9 @@ void test_gemm_work(Params& params, bool run) int64_t Cm = m; int64_t Cn = n; - auto A_alloc = allocate_test_Matrix( false, Am, An, params ); - auto B_alloc = allocate_test_Matrix( false, Bm, Bn, params ); - auto C_alloc = allocate_test_Matrix( ref, Cm, Cn, params ); + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); auto& A = A_alloc.A; auto& B = B_alloc.A; @@ -147,9 +147,9 @@ void test_gemm_work(Params& params, bool run) TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { // Compute Y = alpha A * (B * X) + (beta C * X). 
- X_alloc = allocate_test_Matrix( false, n, nrhs, params ); - Y_alloc = allocate_test_Matrix( false, m, nrhs, params ); - Z_alloc = allocate_test_Matrix( false, k, nrhs, params ); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, k, nrhs, params ); auto& X = X_alloc.A; auto& Y = Y_alloc.A; diff --git a/test/test_gesv.cc b/test/test_gesv.cc index 19408010f..ac5009036 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -183,11 +183,11 @@ void test_gesv_work(Params& params, bool run) int64_t info = 0; - auto A_alloc = allocate_test_Matrix( check || ref, m, n, params ); - auto B_alloc = allocate_test_Matrix( check || ref, n, nrhs, params ); + auto A_alloc = allocate_test_Matrix( check || ref, true, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, true, n, nrhs, params ); TestMatrix> X_alloc; if (is_iterative) { - X_alloc = allocate_test_Matrix( false, n, nrhs, params ); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); } auto& A = A_alloc.A; diff --git a/test/test_set.cc b/test/test_set.cc index bfd4bf8c4..68c2790c3 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_set_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,17 +49,17 @@ void test_set_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -73,49 +71,21 @@ void test_set_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - - slate::Matrix Afull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - // Create SLATE matrix from the ScaLAPACK layout. - Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - } - slate::generate_matrix( params.matrix, Afull ); + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); - // Cast to desired matrix type. 
- matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); + auto& Afull = A_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + + slate::generate_matrix( params.matrix, Afull ); - // if reference run is required, copy test data - std::vector Aref_data; - slate::Matrix Aref_full; - matrix_type Aref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); + copy_matrix( Afull, Aref_full ); } + // Cast to desired matrix type. + matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); + if (trans == slate::Op::Trans) A = transpose( A ); else if (trans == slate::Op::ConjTrans) @@ -153,25 +123,26 @@ void test_set_work(Params& params, bool run) // comparison with reference routine from ScaLAPACK // BLACS/MPI variables + int mpi_rank, myrow, mycol; blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); Cblacs_pinfo(&mpi_rank_, &nprocs); slate_assert( mpi_rank_ == mpi_rank ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit(&ictxt, "Col", p, q); + gridinfo(mpi_rank, p, q, &myrow, &mycol); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); @@ -182,6 +153,7 @@ void test_set_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); + blas_int info; scalapack_plaset( uplo2str(uplo), m, n, alpha, beta, &Aref_data[0], 1, 1, A_desc ); slate_assert(info == 0); @@ -195,10 +167,10 @@ void test_set_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. - slate::add( -one, Aref_full, one, Afull ); - real_t A_diff_norm = slate::norm( slate::Norm::One, Afull ); + subtract_matrices( Afull, Aref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::One, Aref_full ); - print_matrix( "A_diff_full", Afull, params ); + print_matrix( "A_diff_full", Aref_full, params ); real_t error = A_diff_norm / (n * A_norm); params.error() = error; @@ -207,7 +179,6 @@ void test_set_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); #endif diff --git a/test/test_utils.hh b/test/test_utils.hh new file mode 100644 index 000000000..8b4f39fd6 --- /dev/null +++ b/test/test_utils.hh @@ -0,0 +1,111 @@ +// Copyright (c) 2017-2023, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
+ +#ifndef SLATE_TEST_UTILS_HH +#define SLATE_TEST_UTILS_HH + +#include "slate/slate.hh" + +template +void matrix_iterator( matrix_type& A, matrix_type& B, + std::function thunk ) +{ + using scalar_t = typename matrix_type::value_type; + assert( A.m() == B.m() ); + assert( A.n() == B.n() ); + + int64_t B_mt = B.mt(); + int64_t B_nt = B.nt(); + if constexpr (std::is_same>::value) { + + int64_t A_j = 0, A_jj = 0; + for (int64_t B_j = 0; B_j < B_nt; ++B_j) { + + int64_t A_i = 0, A_ii = 0; + for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + #pragma omp task shared(A, B) \ + firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) + { + if (B.tileIsLocal( B_i, B_j )) { + A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), + slate::Layout::ColMajor ); + + A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); + B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); + auto TA = A( A_i, A_j ); + auto TB = B( B_i, B_j ); + int64_t mb = TB.mb(); + int64_t nb = TB.nb(); + assert( A_ii + mb <= TA.mb() ); + assert( A_jj + nb <= TA.nb() ); + int64_t lda = TA.stride(); + int64_t ldb = TB.stride(); + scalar_t const* TA_data = TA.data(); + scalar_t* TB_data = TB.data(); + for (int64_t jj = 0; jj < nb; ++jj) { + for (int64_t ii = 0; ii < mb; ++ii) { + thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], + TB_data[ ii + jj*ldb ] ); + } + } + } + else if (A.tileIsLocal( A_i, A_j )) { + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ) ); + } + } + + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; + } + } + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; + } + } + } + else { + assert( false ); + } + + A.releaseRemoteWorkspace(); +} + +// ----------------------------------------------------------------------------- +// subtract_matrices takes input matrices A and B, and performs B = B - A. +// The matrices must have the same size, but can have different tile sizes and +// distributions. However, the elements of a tile of B must all belong to the +// same tile of A + +template +void subtract_matrices( matrix_type& A, matrix_type& B ) +{ + using scalar_t = typename matrix_type::value_type; + + matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } ); +} + +// ----------------------------------------------------------------------------- +// copy_matrix copies A to B +// The matrices must have the same size, but can have different tile sizes and +// distributions. 
However, the elements of a tile of B must all belong to the +// same tile of A + +template +void copy_matrix( matrix_type& A, matrix_type& B ) +{ + using scalar_t = typename matrix_type::value_type; + + matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b = a; } ); +} + + +#endif // SLATE_TEST_UTILS_HH From 9ff51bf99f8fc3eecd727747c5f369cd9b638cb1 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 09:00:01 -0500 Subject: [PATCH 05/19] Add nonuniform-nb to run_test and tweak name in tester --- test/run_tests.py | 36 +++++++++++++++++++----------------- test/test.cc | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index 0dd06e3e1..a80751fa8 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -123,6 +123,7 @@ group_opt.add_argument( '--lookahead', action='store', help='default=%(default)s', default='1' ) group_opt.add_argument( '--dev-dist', action='store', help='default=%(default)s', default='c,r' ) group_opt.add_argument( '--nb', action='store', help='default=%(default)s', default='64,100' ) +group_opt.add_argument( '--nonuniform-nb', action='store', help='default=%(default)s', default='n' ) group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' ) group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' ) group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' ) @@ -298,6 +299,7 @@ la = ' --lookahead ' + opts.lookahead if (opts.lookahead) else '' ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else '' nb = ' --nb ' + opts.nb if (opts.nb) else '' +nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else '' nt = ' --nt ' + opts.nt if (opts.nt) else '' grid = ' --grid ' + opts.grid if (opts.grid) else '' repeat = ' --repeat ' + opts.repeat if (opts.repeat) else '' @@ -346,9 +348,9 @@ def filter_csv( values, csv ): cmds += [ [ 'gbmm', gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ], - [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC ], - [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC ], - [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC ], + [ 'gemm', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], + [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], + [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ], [ 'hemm', gen + dtype + la + side + uplo + mn + ab + matrixBC ], # todo: hemmA GPU support @@ -384,21 +386,21 @@ def filter_csv( values, csv ): # LU if (opts.lu): cmds += [ - [ 'gesv', gen + dtype + la + n + thresh ], + [ 'gesv', gen + dtype + la + n + thresh + nonuniform_nb ], [ 'gesv_tntpiv', gen + dtype + la + n ], - [ 'gesv_nopiv', gen + dtype + la + n - + ' --matrix rand_dominant --nonuniform_nb n' ], + [ 'gesv_nopiv', gen + dtype + la + n + nonuniform_nb + + ' --matrix rand_dominant' ], # todo: mn - [ 'getrf', gen + dtype + la + n + thresh ], + [ 'getrf', gen + dtype + la + n + thresh + nonuniform_nb ], [ 'getrf_tntpiv', gen + dtype + la + n ], - [ 'getrf_nopiv', gen + dtype + la + n - + ' --matrix rand_dominant --nonuniform_nb n' ], + [ 'getrf_nopiv', gen + dtype + la + n + nonuniform_nb + + ' --matrix rand_dominant' ], - [ 'getrs', gen + dtype + la + n + trans + thresh ], + [ 'getrs', gen + dtype + la + n + trans + thresh + nonuniform_nb ], [ 
'getrs_tntpiv', gen + dtype + la + n + trans ], - [ 'getrs_nopiv', gen + dtype + la + n + trans - + ' --matrix rand_dominant --nonuniform_nb n' ], + [ 'getrs_nopiv', gen + dtype + la + n + trans + nonuniform_nb + + ' --matrix rand_dominant' ], [ 'getri', gen + dtype + la + n ], [ 'getriOOP', gen + dtype + la + n ], @@ -646,11 +648,11 @@ def filter_csv( values, csv ): [ 'scale_row_col', gen + dtype + mn + equed ], - [ 'set', gen + dtype + mn + ab ], - [ 'tzset', gen + dtype + mn + ab + uplo ], - [ 'trset', gen + dtype + n + ab + uplo ], - [ 'syset', gen + dtype + n + ab + uplo ], - [ 'heset', gen + dtype + n + ab + uplo ], + [ 'set', gen + dtype + mn + ab + nonuniform_nb ], + [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + uplo ], + [ 'trset', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'syset', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'heset', gen + dtype + n + ab + nonuniform_nb + uplo ], ] # ------------------------------------------------------------------------------ diff --git a/test/test.cc b/test/test.cc index ebf397f78..078df162a 100644 --- a/test/test.cc +++ b/test/test.cc @@ -403,7 +403,7 @@ Params::Params(): panel_threads("pt", 2, ParamType::List, std::max( omp_get_max_threads() / 2, 1 ), 0, 1000000, "(pt) max number of threads used in panel; default omp_num_threads / 2"), align ("align", 5, ParamType::List, 32, 1, 1024, "column alignment (sets lda, ldb, etc. to multiple of align)"), - nonuniform_nb("nonuniform_nb", + nonuniform_nb("nonuniform-nb", 0, ParamType::List, 'n', "ny", "generate matrix with nonuniform tile sizes"), debug ("debug", 0, ParamType::Value, -1, 0, 1000000, "given rank waits for debugger (gdb/lldb) to attach"), From 59b604ca3584929ab4b281f72f4fb264a21d7d3f Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 09:30:24 -0500 Subject: [PATCH 06/19] Reduce layout conversions in getrf and getrf_tntpiv This seemed to be causing some issues for getrf w/ nonuniform tiles --- src/getrf.cc | 9 ++++----- src/getrf_tntpiv.cc | 12 ++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/getrf.cc b/src/getrf.cc index cca880e4d..b9c2649ed 100644 --- a/src/getrf.cc +++ b/src/getrf.cc @@ -107,7 +107,7 @@ int64_t getrf( bcast_list_A.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}}); } A.template listBcast( - bcast_list_A, Layout::ColMajor, tag_k ); + bcast_list_A, target_layout, tag_k ); // Root broadcasts the pivot to all ranks. // todo: Panel ranks send the pivots to the right. 
@@ -139,7 +139,7 @@ int64_t getrf( internal::trsm( Side::Left, one, std::move( Tkk ), A.sub(k, k, j, j), - priority_1, Layout::ColMajor, queue_jk1 ); + priority_1, target_layout, queue_jk1 ); // send A(k, j) across column A(k+1:mt-1, j) // todo: trsm still operates in ColMajor @@ -196,7 +196,7 @@ int64_t getrf( Side::Left, one, std::move( Tkk ), A.sub(k, k, k+1+lookahead, A_nt-1), - priority_0, Layout::ColMajor, queue_1 ); + priority_0, target_layout, queue_1 ); // send A(k, kl+1:A_nt-1) across A(k+1:mt-1, kl+1:nt-1) BcastList bcast_list_A; @@ -204,9 +204,8 @@ int64_t getrf( // send A(k, j) across column A(k+1:mt-1, j) bcast_list_A.push_back({k, j, {A.sub(k+1, A_mt-1, j, j)}}); } - // todo: trsm still operates in ColMajor A.template listBcast( - bcast_list_A, Layout::ColMajor, tag_kl1); + bcast_list_A, target_layout, tag_kl1); // A(k+1:mt-1, kl+1:nt-1) -= A(k+1:mt-1, k) * A(k, kl+1:nt-1) internal::gemm( diff --git a/src/getrf_tntpiv.cc b/src/getrf_tntpiv.cc index c7c4b7892..20b88b098 100644 --- a/src/getrf_tntpiv.cc +++ b/src/getrf_tntpiv.cc @@ -223,7 +223,7 @@ int64_t getrf_tntpiv( Side::Right, one, std::move(Tkk), A.sub( k+1, A_mt-1, k, k ), - priority_1, Layout::ColMajor, queue_0 ); + priority_1, target_layout, queue_0 ); } #pragma omp task depend(inout:column[k]) \ @@ -238,7 +238,7 @@ int64_t getrf_tntpiv( bcast_list.push_back({i, k, {A.sub(i, i, k+1, A_nt-1)}, tag}); } A.template listBcastMT( - bcast_list, Layout::ColMajor ); + bcast_list, target_layout ); } // update lookahead column(s), high priority @@ -262,13 +262,13 @@ int64_t getrf_tntpiv( internal::trsm( Side::Left, one, std::move( Tkk ), A.sub( k, k, j, j ), - priority_1, Layout::ColMajor, queue_jk1 ); + priority_1, target_layout, queue_jk1 ); // send A(k, j) across column A(k+1:mt-1, j) // todo: trsm still operates in ColMajor A.tileBcast( k, j, A.sub( k+1, A_mt-1, j, j ), - Layout::ColMajor, tag_j ); + target_layout, tag_j ); } #pragma omp task depend(in:column[k]) \ @@ -306,7 +306,7 @@ int64_t getrf_tntpiv( Side::Left, one, std::move( Tkk ), A.sub( k, k, k+1+lookahead, A_nt-1 ), - priority_0, Layout::ColMajor, queue_1 ); + priority_0, target_layout, queue_1 ); } #pragma omp task depend(inout:column[k+1+lookahead]) \ @@ -322,7 +322,7 @@ int64_t getrf_tntpiv( bcast_list.push_back({k, j, {A.sub(k+1, A_mt-1, j, j)}, tag}); } A.template listBcastMT( - bcast_list, Layout::ColMajor); + bcast_list, target_layout); } From c9c04779f53e191051624a3a3b11c2cc5b638a33 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 12:21:04 -0500 Subject: [PATCH 07/19] Fix handling on internal workspaces --- include/slate/BaseMatrix.hh | 3 ++- include/slate/internal/MatrixStorage.hh | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index d263a3bd7..cb48b5147 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -3487,6 +3487,7 @@ void BaseMatrix::tileLayoutConvert( batch_count = std::max(batch_count, int64_t(bucket->second.first.size())); } + allocateBatchArrays( batch_count, 1 ); lapack::Queue* queue = comm_queue(device); @@ -3497,7 +3498,7 @@ void BaseMatrix::tileLayoutConvert( batch_count = bucket->second.first.size(); scalar_t** array_dev = this->array_device(device); - scalar_t** work_array_dev = this->array_device(device) + batch_count; + scalar_t** work_array_dev = array_dev + batch_count; assert(array_dev != nullptr); assert(work_array_dev != nullptr); diff --git a/include/slate/internal/MatrixStorage.hh 
b/include/slate/internal/MatrixStorage.hh index b2d8fb98c..5019bbaad 100644 --- a/include/slate/internal/MatrixStorage.hh +++ b/include/slate/internal/MatrixStorage.hh @@ -645,11 +645,10 @@ void MatrixStorage::allocateBatchArrays( // Free device arrays. blas::device_free(array_dev_[i][device], *queue); - // Free queues. - delete compute_queues_[i][device]; - - // Allocate queues. - compute_queues_[ i ][ device ] = new lapack::Queue( device ); + if (compute_queues_[ i ][ device ] == nullptr) { + // Allocate queues. + compute_queues_[ i ][ device ] = new lapack::Queue( device ); + } // Allocate host arrays; array_host_[i][device] From 6777f8a095c52d0891bcb3b5052dda997ea30465 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 13:20:49 -0500 Subject: [PATCH 08/19] Add nonuniform tiles to add testers --- test/test_add.cc | 86 ++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 61 deletions(-) diff --git a/test/test_add.cc b/test/test_add.cc index 9f8703500..b38228dc5 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_add_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,17 +49,17 @@ void test_add_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -73,38 +71,16 @@ void test_add_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data, B_data; - - int64_t nlocB = nlocA, lldB = lldA; - - slate::Matrix Afull, Bfull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - Bfull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Bfull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - B_data.resize( lldB*nlocB ); - // Create SLATE matrix from the ScaLAPACK layout. 
- Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - Bfull = slate::Matrix::fromScaLAPACK( - m, n, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Bfull = B_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Bref_full = B_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& Bref_data = B_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); slate::generate_matrix( params.matrix, Bfull ); @@ -113,22 +89,9 @@ void test_add_work(Params& params, bool run) matrix_type B = matrix_cast< matrix_type >( Bfull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data, Bref_data; - slate::Matrix Aref_full, Bref_full; - matrix_type Aref, Bref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); - - Bref_data.resize( lldB*nlocB ); - Bref_full = slate::Matrix::fromScaLAPACK( - m, n, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - Bref = matrix_cast< matrix_type >( Bfull, uplo, diag ); - slate::copy( Bfull, Bref_full ); + copy_matrix( Afull, Aref_full ); + copy_matrix( Bfull, Bref_full ); } //if (trans == slate::Op::Trans) @@ -172,23 +135,24 @@ void test_add_work(Params& params, bool run) blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9], B_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; + int mpi_rank, myrow, mycol; // initialize BLACS and ScaLAPACK Cblacs_pinfo(&mpi_rank_, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); slate_assert( mpi_rank_ == mpi_rank ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit(&ictxt, "Col", p, q); + gridinfo(mpi_rank, p, q, &myrow, &mycol); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - scalapack_descinit(B_desc, m, n, nb, nb, 0, 0, ictxt, lldB, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); real_t B_norm = slate::norm( slate::Norm::Max, B ); @@ -201,6 +165,7 @@ void test_add_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); + int64_t info; if (uplo == slate::Uplo::General) { scalapack_pgeadd( op2str( trans ), m, n, alpha, &Aref_data[0], 1, 1, A_desc, @@ -223,10 +188,10 @@ void test_add_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - slate::add( -one, Bref_full, one, Bfull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); - real_t B_diff_norm = slate::norm( slate::Norm::Max, Bfull ); + subtract_matrices( Afull, Aref_full ); + subtract_matrices( Bfull, Bref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::One, Aref_full ); + real_t B_diff_norm = slate::norm( slate::Norm::One, Bref_full ); print_matrix( "A_diff_full", Afull, params ); print_matrix( "B_diff_full", Bfull, params ); @@ -242,7 +207,6 @@ void test_add_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); SLATE_UNUSED( trans ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); From 931c887a0478836ede01863d5dfcb8060f4e292c Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 13:29:02 -0500 Subject: [PATCH 09/19] Add nonuniform tiles to scale testers --- test/test_scale.cc | 62 +++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/test/test_scale.cc b/test/test_scale.cc index c1bfc985c..a26d66265 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_scale_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,17 +49,17 @@ void test_scale_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -73,47 +71,20 @@ void test_scale_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - - slate::Matrix Afull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - // Create SLATE matrix from the ScaLAPACK layout. - Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); // Cast to desired matrix type. 
matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data; - slate::Matrix Aref_full; - matrix_type Aref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); + copy_matrix( Afull, Aref_full ); } //if (trans == slate::Op::Trans) @@ -152,25 +123,26 @@ void test_scale_work(Params& params, bool run) // comparison with reference routine from ScaLAPACK // BLACS/MPI variables + int mpi_rank, myrow, mycol; blas_int ictxt, p_, q_, myrow_, mycol_; blas_int A_desc[9]; blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); Cblacs_pinfo(&mpi_rank_, &nprocs); slate_assert( mpi_rank_ == mpi_rank ); slate_assert(p*q <= nprocs); Cblacs_get(-1, 0, &ictxt); Cblacs_gridinit(&ictxt, "Col", p, q); + gridinfo(mpi_rank, p, q, &myrow, &mycol); Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); slate_assert( p == p_ ); slate_assert( q == q_ ); slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); @@ -181,6 +153,7 @@ void test_scale_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); + int64_t info; scalapack_plascl( uplo2str(uplo), alpha, beta, m, n, &Aref_data[0], 1, 1, A_desc, &info ); slate_assert(info == 0); @@ -194,8 +167,8 @@ void test_scale_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); + subtract_matrices( Afull, Aref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); print_matrix( "A_diff_full", Afull, params ); @@ -208,7 +181,6 @@ void test_scale_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); #endif From 4a735e833fc7e5e962ebcb9dd0dad18a3867c27e Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 14:09:24 -0500 Subject: [PATCH 10/19] De-duplicate scalapack setup code --- test/grid_utils.hh | 31 +++++++++++++++++++++++++++++++ test/test_add.cc | 21 ++------------------- test/test_gemm.cc | 20 ++------------------ test/test_gesv.cc | 27 +++------------------------ test/test_scale.cc | 21 ++------------------- test/test_set.cc | 21 ++------------------- 6 files changed, 42 insertions(+), 99 deletions(-) diff --git a/test/grid_utils.hh b/test/grid_utils.hh index 945cddafc..b03a27a59 100644 --- a/test/grid_utils.hh +++ b/test/grid_utils.hh @@ -7,6 +7,7 @@ #define SLATE_GRID_UTILS_HH #include "slate/slate.hh" +#include "scalapack_wrappers.hh" #include @@ -56,4 +57,34 @@ inline void gridinfo( gridinfo( mpi_rank, slate::GridOrder::Col, p, q, my_row, my_col ); } +//------------------------------------------------------------------------------ +#ifdef SLATE_HAVE_SCALAPACK +// Sets up a BLACS context with the given grid +inline void create_ScaLAPACK_context( slate::GridOrder grid_order, + int p, int q, + blas_int* ictxt ) +{ + // BLACS/MPI variables + int mpi_rank, myrow, mycol; + blas_int p_, q_, myrow_, mycol_; + blas_int mpi_rank_ = 0, nprocs = 1; + + // initialize BLACS and ScaLAPACK + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); + Cblacs_pinfo( &mpi_rank_, &nprocs ); + slate_assert( mpi_rank_ == mpi_rank ); + + Cblacs_get( -1, 0, ictxt ); + + slate_assert( p*q <= nprocs ); + Cblacs_gridinit( ictxt, grid_order2str( grid_order ), p, q ); + gridinfo( mpi_rank, p, q, &myrow, &mycol ); + Cblacs_gridinfo( *ictxt, &p_, &q_, &myrow_, &mycol_ ); + slate_assert( p == p_ ); + slate_assert( q == q_ ); + slate_assert( myrow == myrow_ ); + slate_assert( mycol == mycol_ ); +} +#endif // SLATE_HAVE_SCALAPACK + #endif // SLATE_GRID_UTILS_HH diff --git a/test/test_add.cc b/test/test_add.cc index b38228dc5..b2d9fe6e7 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -131,26 +131,9 @@ void test_add_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - int mpi_rank, myrow, mycol; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - + blas_int ictxt, A_desc[9], B_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gemm.cc 
b/test/test_gemm.cc index 659186610..bd5425f72 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -229,25 +229,9 @@ void test_gemm_work(Params& params, bool run) return; } - // BLACS/MPI variables - int mpi_rank, myrow, mycol; - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + create_ScaLAPACK_context( grid_order, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); diff --git a/test/test_gesv.cc b/test/test_gesv.cc index ac5009036..a19beadb8 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -390,35 +390,14 @@ void test_gesv_work(Params& params, bool run) return; } - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); - gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); - - // BLACS/MPI variables - blas_int ictxt, myrow_, mycol_, p_, q_; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int nb = A_alloc.nb; - - // ScaLAPACK descriptor for the reference matrix - blas_int Aref_desc[9], Bref_desc[9]; - + blas_int ictxt, Aref_desc[9], Bref_desc[9]; + create_ScaLAPACK_context( grid_order, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); // ScaLAPACK data for pivots. - std::vector ipiv_ref(A_alloc.lld + nb); + std::vector ipiv_ref(A_alloc.lld + A_alloc.nb); if (params.routine == "getrs") { // Factor matrix A. 
diff --git a/test/test_scale.cc b/test/test_scale.cc index a26d66265..a98ffcb2d 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -122,26 +122,9 @@ void test_scale_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - int mpi_rank, myrow, mycol; - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - + blas_int ictxt, A_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); diff --git a/test/test_set.cc b/test/test_set.cc index 68c2790c3..9cb89e5e8 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -122,26 +122,9 @@ void test_set_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - int mpi_rank, myrow, mycol; - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - + blas_int ictxt, A_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); From 50b8fd135cf25a1e81a23da71c1190b80bf81a76 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 14:30:02 -0500 Subject: [PATCH 11/19] Add nonuniform tiles to scale_row_col tester --- test/grid_utils.hh | 2 +- test/test_scale_row_col.cc | 84 +++++++++----------------------------- 2 files changed, 21 insertions(+), 65 deletions(-) diff --git a/test/grid_utils.hh b/test/grid_utils.hh index b03a27a59..20ba6b9f6 100644 --- a/test/grid_utils.hh +++ b/test/grid_utils.hh @@ -78,7 +78,7 @@ inline void create_ScaLAPACK_context( slate::GridOrder grid_order, slate_assert( p*q <= nprocs ); Cblacs_gridinit( ictxt, grid_order2str( grid_order ), p, q ); - gridinfo( mpi_rank, p, q, &myrow, &mycol ); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); Cblacs_gridinfo( *ictxt, &p_, &q_, &myrow_, &mycol_ ); slate_assert( p == p_ ); slate_assert( q == q_ ); diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 6c11bb3e7..163862391 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -29,9 +30,6 @@ void test_scale_row_col_work( Params& params, bool run ) using slate::ceildiv; using slate::Equed; - // Constants - const 
scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -51,7 +49,6 @@ void test_scale_row_col_work( Params& params, bool run ) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; @@ -59,10 +56,11 @@ void test_scale_row_col_work( Params& params, bool run ) bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; int verbose = params.verbose(); - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -74,47 +72,20 @@ void test_scale_row_col_work( Params& params, bool run ) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); - gridinfo( mpi_rank, p, q, &myrow, &mycol ); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols( m, nb, myrow, p ); - int64_t nlocA = num_local_rows_cols( n, nb, mycol, q ); - int64_t lldA = blas::max( 1, mlocA ); // local leading dimension of A - std::vector A_data; - - slate::Matrix Afull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - // Create SLATE matrix from the ScaLAPACK layout. - Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); // Cast to desired matrix type. matrix_type A = matrix_cast< matrix_type >( Afull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data; - slate::Matrix Aref_full; - matrix_type Aref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. 
- Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); + copy_matrix( Afull, Aref_full ); } if (trans == slate::Op::Trans) @@ -165,30 +136,17 @@ void test_scale_row_col_work( Params& params, bool run ) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo( &mpi_rank_, &nprocs ); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert( p*q <= nprocs ); - Cblacs_get( -1, 0, &ictxt ); - Cblacs_gridinit( &ictxt, "Col", p, q ); - Cblacs_gridinfo( ictxt, &p_, &q_, &myrow_, &mycol_ ); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit( A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info ); - slate_assert( info == 0 ); + blas_int ictxt, A_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_max = slate::norm( slate::Norm::Max, A ); - std::vector Rlocal( mlocA ), Clocal( nlocA ); + std::vector Rlocal( A_alloc.mloc ), Clocal( A_alloc.nloc ); + + int myrow, mycol; + gridinfo( A.mpiRank(), slate::GridOrder::Col, p, q, &myrow, &mycol ); // Copy local part of R. int64_t ii = 0, iilocal = 0; @@ -235,7 +193,6 @@ void test_scale_row_col_work( Params& params, bool run ) scalapack_plaqge( m, n, &Aref_data[0], 1, 1, A_desc, Rlocal.data(), Clocal.data(), rowcnd, colcnd, A_max, &equed_out ); - slate_assert( info == 0 ); time = barrier_get_wtime( MPI_COMM_WORLD ) - time; params.ref_time() = time; @@ -248,8 +205,8 @@ void test_scale_row_col_work( Params& params, bool run ) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); + subtract_matrices( Afull, Aref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); print_matrix( "A_diff_full", Afull, params ); int64_t i = blas::iamax( m, &R[ 0 ], 1 ); @@ -264,7 +221,7 @@ void test_scale_row_col_work( Params& params, bool run ) real_t eps = std::numeric_limits::epsilon(); params.okay() = (error <= 3*eps); - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf( "%% A_diff_norm %.2e, A_max %.2e, R_max %.2e," " C_max %.2e, eps %.2e, 3*eps %.2e, error %.2e\n", A_diff_norm, A_max, R_max, C_max, eps, 3*eps, error ); @@ -273,7 +230,6 @@ void test_scale_row_col_work( Params& params, bool run ) Cblacs_gridexit( ictxt ); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); SLATE_UNUSED( verbose ); if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); From 1dfa15d57204cce7d810c48973319a75ca8538cd Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 15:28:42 -0500 Subject: [PATCH 12/19] Add nonuniform tiles to copy tester --- test/test_copy.cc | 102 ++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 76 deletions(-) diff --git a/test/test_copy.cc b/test/test_copy.cc index 5760b9f62..7f19380a2 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -12,6 +12,7 @@ #include "print_matrix.hh" #include "grid_utils.hh" #include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -28,9 +29,6 @@ void test_copy_work(Params& params, bool run) using blas::imag; using slate::ceildiv; - // Constants - const scalar_t one = 1.0; - // get & mark input values slate::Uplo uplo; if (std::is_same< matrix_type, slate::Matrix >::value) @@ -49,17 +47,17 @@ void test_copy_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t nb = params.nb(); int64_t p = params.grid.m(); int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - slate::Origin origin = params.origin(); slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -76,34 +74,16 @@ void test_copy_work(Params& params, bool run) MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); gridinfo(mpi_rank, p, q, &myrow, &mycol); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data, B_data; - real_t A_norm, B_norm; - - int64_t nlocB = nlocA, lldB = lldA; - - slate::Matrix Afull, Bfull; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target( origin ); - Afull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Afull.insertLocalTiles( origin_target ); - Bfull = slate::Matrix( m, n, nb, p, q, MPI_COMM_WORLD); - Bfull.insertLocalTiles( origin_target ); - } - else { - // Allocate ScaLAPACK data. - A_data.resize( lldA*nlocA ); - B_data.resize( lldB*nlocB ); - // Create SLATE matrix from the ScaLAPACK layout. 
- Afull = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - Bfull = slate::Matrix::fromScaLAPACK( - m, n, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD ); - } + auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); + + auto& Afull = A_alloc.A; + auto& Bfull = B_alloc.A; + auto& Aref_full = A_alloc.Aref; + auto& Bref_full = B_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& Bref_data = B_alloc.Aref_data; + slate::generate_matrix( params.matrix, Afull ); slate::generate_matrix( params.matrix, Bfull ); @@ -112,22 +92,9 @@ void test_copy_work(Params& params, bool run) matrix_type B = matrix_cast< matrix_type >( Bfull, uplo, diag ); // if reference run is required, copy test data - std::vector Aref_data, Bref_data; - slate::Matrix Aref_full, Bref_full; - matrix_type Aref, Bref; if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. - Aref_data.resize( lldA*nlocA ); - Aref_full = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Aref = matrix_cast< matrix_type >( Afull, uplo, diag ); - slate::copy( Afull, Aref_full ); - - Bref_data.resize( lldB*nlocB ); - Bref_full = slate::Matrix::fromScaLAPACK( - m, n, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - Bref = matrix_cast< matrix_type >( Bfull, uplo, diag ); - slate::copy( Bfull, Bref_full ); + copy_matrix( Afull, Aref_full ); + copy_matrix( Bfull, Bref_full ); } //if (trans == slate::Op::Trans) @@ -171,30 +138,14 @@ void test_copy_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - scalapack_descinit(B_desc, m, n, nb, nb, 0, 0, ictxt, lldB, &info); - slate_assert(info == 0); - - A_norm = slate::norm( slate::Norm::Max, A ); - B_norm = slate::norm( slate::Norm::Max, B ); + blas_int ictxt, A_desc[9], B_desc[9]; + create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + + real_t A_norm = slate::norm( slate::Norm::Max, A ); + real_t B_norm = slate::norm( slate::Norm::Max, B ); print_matrix( "Aref_full", Aref_full, params ); print_matrix( "Bref_full", Bref_full, params ); @@ -218,10 +169,10 @@ void test_copy_work(Params& params, bool run) // Do this on full m-by-n matrix to detect if on, say, // a lower triangular matrix, the kernel accidentally modifies // the upper triangle. 
- slate::add( -one, Aref_full, one, Afull ); - slate::add( -one, Bref_full, one, Bfull ); - real_t A_diff_norm = slate::norm( slate::Norm::Max, Afull ); - real_t B_diff_norm = slate::norm( slate::Norm::Max, Bfull ); + subtract_matrices( Afull, Aref_full ); + subtract_matrices( Bfull, Bref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); + real_t B_diff_norm = slate::norm( slate::Norm::Max, Bref_full ); print_matrix( "A_diff_full", Afull, params ); print_matrix( "B_diff_full", Bfull, params ); @@ -235,7 +186,6 @@ void test_copy_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( one ); SLATE_UNUSED( A_norm ); SLATE_UNUSED( B_norm ); if (mpi_rank == 0) From 2065590fc3bc18622f21d7c17715fc132506d830 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 15:45:04 -0500 Subject: [PATCH 13/19] Add nonuniform-nb to more tests in run_tests --- test/run_tests.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/test/run_tests.py b/test/run_tests.py index a80751fa8..4561e217f 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -406,8 +406,8 @@ def filter_csv( values, csv ): [ 'getriOOP', gen + dtype + la + n ], #[ 'gerfs', gen + dtype + la + n + trans ], #[ 'geequ', gen + dtype + la + n ], - [ 'gesv_mixed', gen + dtype_double + la + n ], - [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' ], + [ 'gesv_mixed', gen + dtype_double + la + n + nonuniform_nb ], + [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb ], [ 'gesv_rbt', gen + dtype + la + n ], ] @@ -628,25 +628,25 @@ def filter_csv( values, csv ): # aux if (opts.aux): cmds += [ - [ 'add', gen + dtype + mn + ab ], - [ 'tzadd', gen + dtype + mn + ab + uplo ], - [ 'tradd', gen + dtype + n + ab + uplo ], - [ 'syadd', gen + dtype + n + ab + uplo ], - [ 'headd', gen + dtype + n + ab + uplo ], - - [ 'copy', gen + dtype + mn ], - [ 'tzcopy', gen + dtype + mn + uplo ], - [ 'trcopy', gen + dtype + n + uplo ], - [ 'sycopy', gen + dtype + n + uplo ], - [ 'hecopy', gen + dtype + n + uplo ], - - [ 'scale', gen + dtype + mn + ab ], - [ 'tzscale', gen + dtype + mn + ab + uplo ], - [ 'trscale', gen + dtype + n + ab + uplo ], - [ 'syscale', gen + dtype + n + ab + uplo ], - [ 'hescale', gen + dtype + n + ab + uplo ], - - [ 'scale_row_col', gen + dtype + mn + equed ], + [ 'add', gen + dtype + mn + ab + nonuniform_nb ], + [ 'tzadd', gen + dtype + mn + ab + nonuniform_nb + uplo ], + [ 'tradd', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'syadd', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'headd', gen + dtype + n + ab + nonuniform_nb + uplo ], + + [ 'copy', gen + dtype + mn + nonuniform_nb ], + [ 'tzcopy', gen + dtype + mn + nonuniform_nb + uplo ], + [ 'trcopy', gen + dtype + n + nonuniform_nb + uplo ], + [ 'sycopy', gen + dtype + n + nonuniform_nb + uplo ], + [ 'hecopy', gen + dtype + n + nonuniform_nb + uplo ], + + [ 'scale', gen + dtype + mn + ab + nonuniform_nb ], + [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + uplo ], + [ 'trscale', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'syscale', gen + dtype + n + ab + nonuniform_nb + uplo ], + [ 'hescale', gen + dtype + n + ab + nonuniform_nb + uplo ], + + [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb ], [ 'set', gen + dtype + mn + ab + nonuniform_nb ], [ 'tzset', gen + dtype + mn + ab + nonuniform_nb + uplo ], From 
4b8386fa1760efd8d461d24af59003a7117bda51 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 16:13:11 -0500 Subject: [PATCH 14/19] Avoid extra function calls when unneeded --- include/slate/BaseMatrix.hh | 52 ++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index cb48b5147..5e88b43e5 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -904,32 +904,36 @@ BaseMatrix::BaseMatrix( ++nt_; } - // Pass variables into lambda w/out full this object - int64_t mt_1 = mt_-1; - int64_t last_mb = last_mb_; - int64_t nt_1 = nt_-1; - int64_t last_nb = last_nb_; - std::function - realTileMb = [inTileMb, last_mb, mt_1](int64_t i) { - if (i == mt_1) { - return last_mb; - } - else { - return inTileMb( i ); - } - }; - std::function - realTileNb = [inTileNb, last_nb, nt_1](int64_t j) { - if (j == nt_1) { - return last_nb; - } - else { - return inTileNb( j ); - } - }; + auto actTileMb = inTileMb; + if (last_mb_ != actTileMb( mt_-1 )) { + // Pass variables into lambda w/out full this object + int64_t mt_1 = mt_-1; + int64_t last_mb = last_mb_; + actTileMb = [inTileMb, last_mb, mt_1](int64_t i) { + if (i == mt_1) { + return last_mb; + } + else { + return inTileMb( i ); + } + }; + } + auto actTileNb = inTileNb; + if (last_nb_ != actTileNb( nt_-1 )) { + int64_t nt_1 = nt_-1; + int64_t last_nb = last_nb_; + actTileNb = [inTileNb, last_nb, nt_1](int64_t j) { + if (j == nt_1) { + return last_nb; + } + else { + return inTileNb( j ); + } + }; + } storage_ = std::make_shared< MatrixStorage< scalar_t > >( - mt_, nt_, realTileMb, realTileNb, + mt_, nt_, actTileMb, actTileNb, inTileRank, inTileDevice, mpi_comm ); slate_mpi_call( From 917790b2b63317ceda460d78b391bc77e1540580 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Tue, 14 Nov 2023 16:23:20 -0500 Subject: [PATCH 15/19] Fix issue in scale_row_col tester --- test/matrix_utils.hh | 4 ++-- test/test_scale_row_col.cc | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index fbc3c2a22..f4f923bb1 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -286,13 +286,13 @@ TestMatrix> allocate_test_Matrix( matrix.m = m; matrix.n = n; - // ScaLAPACK variables + // ScaLAPACK variables for reference matrix int mpi_rank, myrow, mycol; MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank ); gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); matrix.nb = nb; matrix.mloc = num_local_rows_cols( m, nb, myrow, p ); - matrix.nloc = num_local_rows_cols (n, nb, mycol, q ); + matrix.nloc = num_local_rows_cols( n, nb, mycol, q ); matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A // Functions for nonuniform tile sizes diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 163862391..7c1ed4ffd 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -150,8 +150,8 @@ void test_scale_row_col_work( Params& params, bool run ) // Copy local part of R. int64_t ii = 0, iilocal = 0; - for (int64_t i = 0; i < A.mt(); ++i) { - int64_t mb_ = A.tileMb( i ); + for (int64_t i = 0; i < Aref_full.mt(); ++i) { + int64_t mb_ = Aref_full.tileMb( i ); if (i % p == myrow) { std::copy( &R[ ii ], &R[ ii + mb_ ], &Rlocal[ iilocal ] ); iilocal += mb_; @@ -162,8 +162,8 @@ void test_scale_row_col_work( Params& params, bool run ) // Copy local part of R. 
int64_t jj = 0, jjlocal = 0; - for (int64_t j = 0; j < A.nt(); ++j) { - int64_t nb_ = A.tileNb( j ); + for (int64_t j = 0; j < Aref_full.nt(); ++j) { + int64_t nb_ = Aref_full.tileNb( j ); if (j % q == mycol) { std::copy( &C[ jj ], &C[ jj + nb_ ], &Clocal[ jjlocal ] ); jjlocal += nb_; From e9cfd7d7f4db0d1ad8de1920ef148a06ba2d8866 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 15 Nov 2023 09:27:08 -0500 Subject: [PATCH 16/19] Remove some vestigial code --- test/test_copy.cc | 5 ----- test/test_set.cc | 2 -- 2 files changed, 7 deletions(-) diff --git a/test/test_copy.cc b/test/test_copy.cc index 7f19380a2..84721bb9d 100644 --- a/test/test_copy.cc +++ b/test/test_copy.cc @@ -69,11 +69,6 @@ void test_copy_work(Params& params, bool run) {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - auto A_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); auto B_alloc = allocate_test_Matrix( check || ref, false, m, n, params ); diff --git a/test/test_set.cc b/test/test_set.cc index 9cb89e5e8..2dc088f0a 100644 --- a/test/test_set.cc +++ b/test/test_set.cc @@ -136,10 +136,8 @@ void test_set_work(Params& params, bool run) //================================================== double time = barrier_get_wtime(MPI_COMM_WORLD); - blas_int info; scalapack_plaset( uplo2str(uplo), m, n, alpha, beta, &Aref_data[0], 1, 1, A_desc ); - slate_assert(info == 0); time = barrier_get_wtime(MPI_COMM_WORLD) - time; params.ref_time() = time; From 995e394d7c80aaa716ecf20a07fb52336b6e36e1 Mon Sep 17 00:00:00 2001 From: Neil Lindquist Date: Wed, 15 Nov 2023 09:33:45 -0500 Subject: [PATCH 17/19] Remove an assertion that broke assumptions in a few routines --- include/slate/internal/MatrixStorage.hh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/slate/internal/MatrixStorage.hh b/include/slate/internal/MatrixStorage.hh index 5019bbaad..cea953d4e 100644 --- a/include/slate/internal/MatrixStorage.hh +++ b/include/slate/internal/MatrixStorage.hh @@ -110,7 +110,6 @@ public: auto tile = tiles_[device+1]; tiles_[device+1] = nullptr; if (tile != nullptr) { - assert(!tile->stateOn(MOSI::OnHold)); delete tile; --num_instances_; } From b4b88f866792934166cc7f2e30f1e92fea889bc5 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Tue, 16 Jan 2024 14:54:47 -0500 Subject: [PATCH 18/19] Update docs, style. Use max-norm. --- include/slate/BaseMatrix.hh | 14 ++++++------ test/matrix_utils.hh | 22 +++++++++---------- test/test_add.cc | 12 ++++++---- test/test_gemm.cc | 2 +- test/test_utils.hh | 44 +++++++++++++++++++++++-------------- 5 files changed, 54 insertions(+), 40 deletions(-) diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index 5e88b43e5..086aff8d0 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -906,7 +906,7 @@ BaseMatrix::BaseMatrix( auto actTileMb = inTileMb; if (last_mb_ != actTileMb( mt_-1 )) { - // Pass variables into lambda w/out full this object + // Pass variables into lambda w/out reference to `this`. int64_t mt_1 = mt_-1; int64_t last_mb = last_mb_; actTileMb = [inTileMb, last_mb, mt_1](int64_t i) { @@ -2710,14 +2710,14 @@ void BaseMatrix::tileGet(int64_t i, int64_t j, int dst_device, + " -> " + std::to_string(dst_device)); } - target_layout = layout == LayoutConvert::None ? - src_tile->layout() : - Layout(layout); + target_layout = layout == LayoutConvert::None + ? 
src_tile->layout() + : Layout(layout); } else { - target_layout = layout == LayoutConvert::None ? - tile_node[dst_device]->layout() : - Layout(layout); + target_layout = layout == LayoutConvert::None + ? tile_node[dst_device]->layout() + : Layout(layout); } if (! tile_node.existsOn(dst_device)) { diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index f4f923bb1..ea20fd584 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -206,7 +206,7 @@ matrix_type matrix_cast( } -// ----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Functions for allocating test matrices template @@ -234,7 +234,7 @@ public: #endif }; -// ----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// Marks the paramters used by allocate_test_Matrix inline void mark_params_for_test_Matrix(Params& params) { @@ -246,23 +246,23 @@ inline void mark_params_for_test_Matrix(Params& params) params.grid_order(); } -// ----------------------------------------------------------------------------- -/// Allocates a Matrix and a reference version for testing. +//------------------------------------------------------------------------------ +/// Allocates a Matrix and optionally a reference version for testing. /// -/// @param ref_matrix[in] +/// @param[in] ref_matrix /// Whether to allocate a reference matrix /// -/// @param nonuniform_ref[in] +/// @param[in] nonuniform_ref /// If params.nonuniform_nb(), whether to also allocate the reference matrix /// with non-uniform tiles. /// -/// @param m[in] +/// @param[in] m /// The number of rows /// -/// @param n[in] +/// @param[in] n /// The number of columns /// -/// @param params[in] +/// @param[in] params /// The test params object which contains many of the key parameters /// template @@ -295,10 +295,10 @@ TestMatrix> allocate_test_Matrix( matrix.nloc = num_local_rows_cols( n, nb, mycol, q ); matrix.lld = blas::max( 1, matrix.mloc ); // local leading dimension of A - // Functions for nonuniform tile sizes + // Functions for nonuniform tile sizes. + // Odd-numbered tiles are 2*nb, even-numbered tiles are nb. std::function< int64_t (int64_t j) > tileNb = [nb](int64_t j) { - // for non-uniform tile size return (j % 2 != 0 ? nb*2 : nb); }; auto tileRank = slate::func::process_2d_grid( grid_order, p, q ); diff --git a/test/test_add.cc b/test/test_add.cc index b2d9fe6e7..3e51600b3 100644 --- a/test/test_add.cc +++ b/test/test_add.cc @@ -173,14 +173,18 @@ void test_add_work(Params& params, bool run) // the upper triangle. subtract_matrices( Afull, Aref_full ); subtract_matrices( Bfull, Bref_full ); - real_t A_diff_norm = slate::norm( slate::Norm::One, Aref_full ); - real_t B_diff_norm = slate::norm( slate::Norm::One, Bref_full ); + real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full ); + real_t B_diff_norm = slate::norm( slate::Norm::Max, Bref_full ); print_matrix( "A_diff_full", Afull, params ); print_matrix( "B_diff_full", Bfull, params ); - real_t errorA = A_diff_norm / (n * A_norm); - real_t errorB = B_diff_norm / (n * B_norm); + // Ideally this would be element-wise error, + // max_ij( | A_ij – Aref_ij | / | Aref_ij | ), + // instead of norm-wise error, + // max_ij( | A_ij – Aref_ij | ) / max_ij( | Aref_ij | ). 
+ real_t errorA = A_diff_norm / A_norm; + real_t errorB = B_diff_norm / B_norm; params.error() = errorA + errorB; real_t tol = params.tol() * std::numeric_limits::epsilon()/2; diff --git a/test/test_gemm.cc b/test/test_gemm.cc index bd5425f72..746e2f74d 100644 --- a/test/test_gemm.cc +++ b/test/test_gemm.cc @@ -78,7 +78,7 @@ void test_gemm_work(Params& params, bool run) return; #ifndef SLATE_HAVE_SCALAPACK - // Can only run ref when we have ScaLAPACK + // Can run ref only when we have ScaLAPACK. if (ref) { if (mpi_rank == 0) printf( "ScaLAPACK not available\n" ); diff --git a/test/test_utils.hh b/test/test_utils.hh index 8b4f39fd6..a946411b0 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -8,15 +8,25 @@ #include "slate/slate.hh" +///----------------------------------------------------------------------------- +/// Applies the operator thunk to each element of A and B to update B. +/// The matrices must have the same size, but can have different tile sizes and +/// distributions. However, the elements of a tile of B must all belong to the +/// same tile of A. For example, this is satisfied in the testers if B +/// has tiles of size nb and A has tiles of size nb or 2*nb. +/// template -void matrix_iterator( matrix_type& A, matrix_type& B, - std::function thunk ) +void matrix_iterator( + matrix_type& A, matrix_type& B, + std::function< void( typename matrix_type::value_type const&, + typename matrix_type::value_type& ) > thunk ) { using scalar_t = typename matrix_type::value_type; assert( A.m() == B.m() ); assert( A.n() == B.n() ); + const auto ColMajor = slate::LayoutConvert::ColMajor; + int64_t B_mt = B.mt(); int64_t B_nt = B.nt(); if constexpr (std::is_same>::value) { @@ -33,8 +43,8 @@ void matrix_iterator( matrix_type& A, matrix_type& B, A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), slate::Layout::ColMajor ); - A.tileGetForReading( A_i, A_j, slate::LayoutConvert::ColMajor ); - B.tileGetForWriting( B_i, B_j, slate::LayoutConvert::ColMajor ); + A.tileGetForReading( A_i, A_j, ColMajor ); + B.tileGetForWriting( B_i, B_j, ColMajor ); auto TA = A( A_i, A_j ); auto TB = B( B_i, B_j ); int64_t mb = TB.mb(); @@ -79,12 +89,12 @@ void matrix_iterator( matrix_type& A, matrix_type& B, A.releaseRemoteWorkspace(); } -// ----------------------------------------------------------------------------- -// subtract_matrices takes input matrices A and B, and performs B = B - A. -// The matrices must have the same size, but can have different tile sizes and -// distributions. However, the elements of a tile of B must all belong to the -// same tile of A - +///----------------------------------------------------------------------------- +/// subtract_matrices takes input matrices A and B, and performs B = B - A. +/// The matrices must have the same size, but can have different tile sizes and +/// distributions. However, the elements of a tile of B must all belong to the +/// same tile of A. +/// template void subtract_matrices( matrix_type& A, matrix_type& B ) { @@ -93,12 +103,12 @@ void subtract_matrices( matrix_type& A, matrix_type& B ) matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } ); } -// ----------------------------------------------------------------------------- -// copy_matrix copies A to B -// The matrices must have the same size, but can have different tile sizes and -// distributions. 
However, the elements of a tile of B must all belong to the -// same tile of A - +///----------------------------------------------------------------------------- +/// copy_matrix copies A to B +/// The matrices must have the same size, but can have different tile sizes and +/// distributions. However, the elements of a tile of B must all belong to the +/// same tile of A. +/// template void copy_matrix( matrix_type& A, matrix_type& B ) { From 2bb33f39421c9e196cdeff09d8b9a9c5acf0becd Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Tue, 16 Jan 2024 17:04:37 -0500 Subject: [PATCH 19/19] Add msg for nonuniform nb --- test/matrix_utils.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh index ea20fd584..7d7f44a96 100644 --- a/test/matrix_utils.hh +++ b/test/matrix_utils.hh @@ -311,6 +311,8 @@ TestMatrix> allocate_test_Matrix( // SLATE allocates CPU or GPU tiles. slate::Target origin_target = origin2target( origin ); if (nonuniform_nb) { + params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) ) + + ", " + std::to_string( tileNb( 1 ) ); matrix.A = slate::Matrix( m, n, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); }
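For reference, a condensed sketch of how a tester body composes the helpers introduced across this series (mark_params_for_test_Matrix, allocate_test_Matrix, copy_matrix, create_ScaLAPACK_context, ScaLAPACK_descriptor, subtract_matrices). This is illustrative only and not part of any patch; the `false` second argument (the nonuniform_ref flag), the explicit template argument, and the routine-specific steps are assumptions based on the calls shown above.

// Illustrative sketch only -- not part of the patch series.
template <typename scalar_t>
void test_example_work( Params& params, bool run )
{
    using real_t = blas::real_type<scalar_t>;

    // Register grid/nb/origin/nonuniform_nb parameters read by allocate_test_Matrix.
    mark_params_for_test_Matrix( params );
    params.matrix.mark();
    if (! run)
        return;

    int64_t m = params.dim.m();
    int64_t n = params.dim.n();
    int64_t p = params.grid.m();
    int64_t q = params.grid.n();
    bool check = params.check() == 'y';
    bool ref   = params.ref()   == 'y';

    // Allocate the SLATE test matrix and, when checking, a reference copy
    // (ScaLAPACK layout unless nonuniform tiles are requested).
    auto A_alloc = allocate_test_Matrix<scalar_t>( check || ref, false, m, n, params );
    auto& Afull     = A_alloc.A;
    auto& Aref_full = A_alloc.Aref;

    slate::generate_matrix( params.matrix, Afull );
    if (check || ref)
        copy_matrix( Afull, Aref_full );  // handles differing tile sizes/distributions

    // ... run the SLATE routine on Afull and the reference routine on Aref_full ...

#ifdef SLATE_HAVE_SCALAPACK
    blas_int ictxt, A_desc[9];
    create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
    A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );

    // Compare: Aref_full -= Afull, then measure the difference in the max-norm.
    subtract_matrices( Afull, Aref_full );
    real_t A_diff_norm = slate::norm( slate::Norm::Max, Aref_full );
    params.error() = A_diff_norm;
    params.okay()  = (A_diff_norm == 0.0);

    Cblacs_gridexit( ictxt );
#endif
}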