diff --git a/GNUmakefile b/GNUmakefile
index 604394716..b71c10c0c 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -702,6 +702,7 @@ endif
 tester_src += \
 	test/matrix_generator.cc \
 	test/matrix_params.cc \
+	test/matrix_utils.cc \
 	test/random.cc \
 	test/test.cc \
 	test/test_add.cc \
diff --git a/src/internal/internal_henorm.cc b/src/internal/internal_henorm.cc
index 7b4eecd14..992287b73 100644
--- a/src/internal/internal_henorm.cc
+++ b/src/internal/internal_henorm.cc
@@ -196,12 +196,14 @@ void norm(
         // Sum tile results into local results.
         // Summing up local contributions only.
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j) &&
                     ( ( lower && i > j) ||
@@ -210,27 +212,26 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += mb;
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
-            int64_t nb = A.tileNb(j);
+            // diagonal blocks
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
 //---------
@@ -345,8 +346,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
@@ -374,7 +373,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, layout, lower, queue_index, in_norm, ldv) \
             priority(priority)
         {
diff --git a/src/internal/internal_synorm.cc b/src/internal/internal_synorm.cc
index 9f814b898..79a5ba291 100644
--- a/src/internal/internal_synorm.cc
+++ b/src/internal/internal_synorm.cc
@@ -143,7 +143,7 @@ void norm(
     int64_t jj = 0;
     #pragma omp taskgroup
     for (int64_t j = 0; j < A.nt(); ++j) {
-        // diagonal tile
+        // diagonal tiles
         if (j < A.mt() && A.tileIsLocal(j, j)) {
             #pragma omp task slate_omp_default_none \
                 shared( A, tiles_sums ) \
@@ -190,17 +190,19 @@ void norm(
             }
             jj += A.tileNb(j);
         }
-        //end omp taskgroup
+        // end omp taskgroup
 
         // Sum tile results into local results.
         // Summing up local contributions only.
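+        // Running offsets (jj over columns, ii over rows) below replace the
+        // old fixed nb0/mb0 strides, so values[] and tiles_sums[] are indexed
+        // correctly when tile sizes are non-uniform.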
         std::fill_n(values, A.n(), 0.0);
-        int64_t nb0 = A.tileNb(0);
-        int64_t mb0 = A.tileMb(0);
-        // off-diagonal blocks
+
+        jj = 0;
         for (int64_t j = 0; j < A.nt(); ++j) {
+            int64_t nb = A.tileNb( j );
+
+            // off-diagonal blocks
+            int64_t ii = 0;
             for (int64_t i = 0; i < A.mt(); ++i) {
-                int64_t nb = A.tileNb(j);
                 int64_t mb = A.tileMb(i);
                 if (A.tileIsLocal(i, j) &&
                     ( ( lower && i > j) ||
@@ -209,27 +211,26 @@ void norm(
                     // col sums
                     blas::axpy(
                         nb, 1.0,
-                        &tiles_sums[A.n()*i + j*nb0 ], 1,
-                        &values[j*nb0], 1);
+                        &tiles_sums[A.n()*i + jj ], 1,
+                        &values[jj], 1);
                     // row sums
                     blas::axpy(
                         mb, 1.0,
-                        &tiles_sums[A.m()*j + i*nb0 ], 1,
-                        &values[i*mb0], 1);
+                        &tiles_sums[A.m()*j + ii ], 1,
+                        &values[ii], 1);
                 }
+                ii += mb;
             }
-        }
-        // diagonal blocks
-        for (int64_t j = 0; j < A.nt(); ++j) {
-            int64_t nb = A.tileNb(j);
+            // diagonal blocks
             if (A.tileIsLocal(j, j) ) {
                 // col sums
                 blas::axpy(
                     nb, 1.0,
-                    &tiles_sums[A.n()*j + j*nb0 ], 1,
-                    &values[j*nb0], 1);
+                    &tiles_sums[A.n()*j + jj ], 1,
+                    &values[jj], 1);
             }
+            jj += nb;
         }
     }
 //---------
@@ -301,6 +302,7 @@ void norm(
                 }
             }
         }
+        // end omp taskgroup
     }
 }
@@ -325,7 +327,8 @@
 /// @ingroup norm_internal
 ///
 template <Target target, typename scalar_t>
-void norm(internal::TargetType<target>,
+void norm(
+    internal::TargetType<target>,
           Norm in_norm, NormScope scope, SymmetricMatrix<scalar_t>& A,
           blas::real_type<scalar_t>* values,
           int priority, int queue_index)
@@ -347,8 +350,6 @@ void norm(internal::TargetType<target>,
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
@@ -376,7 +377,7 @@ void norm(internal::TargetType<target>,
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
+            shared( vals_host_arrays, ijrange ) \
             firstprivate(device, lower, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {
@@ -493,11 +494,6 @@ void norm(internal::TargetType<target>,
     }
     // end omp taskgroup
 
-    for (int device = 0; device < A.num_devices(); ++device) {
-        blas::Queue* queue = A.compute_queue(device, queue_index);
-        blas::device_free(vals_dev_arrays[device], *queue);
-    }
-
     // Reduction over devices to local result.
     if (in_norm == Norm::Max) {
         *values = lapack::lange(in_norm,
diff --git a/src/internal/internal_trnorm.cc b/src/internal/internal_trnorm.cc
index 72e203c44..83cf429ae 100644
--- a/src/internal/internal_trnorm.cc
+++ b/src/internal/internal_trnorm.cc
@@ -365,8 +363,6 @@ void norm(
     std::vector< std::vector<real_t> > vals_host_arrays(A.num_devices());
 
-    std::vector<real_t*> vals_dev_arrays(A.num_devices());
-
     // devices_values used for max and Frobenius norms.
     std::vector<real_t> devices_values;
@@ -399,7 +397,7 @@ void norm(
     for (int device = 0; device < A.num_devices(); ++device) {
         #pragma omp task slate_omp_default_none \
             shared( A, devices_values ) \
-            shared( vals_host_arrays, vals_dev_arrays, irange, jrange ) \
+            shared( vals_host_arrays, irange, jrange ) \
             firstprivate(device, queue_index, in_norm, ldv, layout) \
             priority(priority)
         {
diff --git a/test/matrix_utils.cc b/test/matrix_utils.cc
new file mode 100644
index 000000000..c751bb302
--- /dev/null
+++ b/test/matrix_utils.cc
@@ -0,0 +1,407 @@
+// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
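+
+// Factory routines that allocate the SLATE test matrices (and, optionally,
+// reference copies) described by the test Params: tile size nb, uniform vs.
+// non-uniform nb, process grid order, and device grid order.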
+
+#include "matrix_utils.hh"
+
+using nb_func_t = std::function< int64_t(int64_t) >;
+using dist_func_t = std::function< int(std::tuple<int64_t, int64_t>) >;
+
+//------------------------------------------------------------------------------
+/// Shared logic of the allocate_test_* routines
+template <typename matrix_type,
+          typename irregular_constructor_t,
+          typename regular_constructor_t,
+          typename scalapack_constructor_t>
+static TestMatrix<matrix_type> allocate_test_shared(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params,
+    irregular_constructor_t construct_irregular,
+    regular_constructor_t construct_regular,
+    scalapack_constructor_t construct_scalapack)
+{
+    // Load params variables
+    int p = params.grid.m();
+    int q = params.grid.n();
+    int64_t nb = params.nb();
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    slate::Origin origin = params.origin();
+    slate::GridOrder grid_order = params.grid_order();
+    slate::GridOrder dev_order = params.dev_order();
+
+    // The object to be returned
+    TestMatrix<matrix_type> matrix( m, n, nb, p, q, grid_order );
+
+    // Functions for nonuniform tile sizes or row device distribution
+    nb_func_t tileNb;
+    if (nonuniform_nb) {
+        tileNb = [nb](int64_t j) {
+            // for non-uniform tile size
+            return (j % 2 != 0 ? nb*2 : nb);
+        };
+    }
+    else {
+        // NB. we let BaseMatrix truncate the final tile length
+        // TrapezoidMatrix only takes 1 function for both dimensions (of different sizes)
+        tileNb = [nb](int64_t j) {
+            return nb;
+        };
+    }
+
+    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
+    int num_devices_ = blas::get_device_count();
+    auto tileDevice = slate::func::device_1d_grid( dev_order, p, num_devices_ );
+
+    // Setup matrix to test SLATE with
+    if (origin != slate::Origin::ScaLAPACK) {
+        // SLATE allocates CPU or GPU tiles.
+        slate::Target origin_target = origin2target( origin );
+        if (nonuniform_nb || dev_order == slate::GridOrder::Col) {
+            matrix.A = construct_irregular( tileNb, tileRank, tileDevice );
+        }
+        else {
+            matrix.A = construct_regular( nb, grid_order, p, q );
+        }
+        matrix.A.insertLocalTiles( origin_target );
+    }
+    else {
+        assert( !nonuniform_nb );
+        assert( dev_order == slate::GridOrder::Row );
+        // Create SLATE matrix from the ScaLAPACK layouts
+        matrix.A_data.resize( matrix.lld * matrix.nloc );
+        matrix.A = construct_scalapack( &matrix.A_data[0], matrix.lld, nb,
+                                        grid_order, p, q );
+    }
+
+    // Setup reference matrix
+    if (ref_matrix) {
+        if (nonuniform_nb && nonuniform_ref) {
+            matrix.Aref = construct_irregular( tileNb, tileRank, tileDevice );
+            matrix.Aref.insertLocalTiles( slate::Target::Host );
+        }
+        else {
+            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
+            matrix.Aref = construct_scalapack( &matrix.Aref_data[0], matrix.lld, nb,
+                                               grid_order, p, q );
+        }
+    }
+
+    return matrix;
+}
+
+// -----------------------------------------------------------------------------
+/// Allocates a Matrix and optionally a reference version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] m
+///     The number of rows
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params)
+{
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return slate::Matrix<scalar_t>( m, n, tileNb, tileNb,
+                                        tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::Matrix<scalar_t>( m, n, nb, nb, grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::Matrix<scalar_t>::fromScaLAPACK(
+                    m, n, data, lld, nb, nb,
+                    grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<slate::Matrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, m, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+// Explicit instantiations.
+template
+TestMatrix<slate::Matrix<float>> allocate_test_Matrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::Matrix<double>> allocate_test_Matrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::Matrix<std::complex<float>>> allocate_test_Matrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::Matrix<std::complex<double>>> allocate_test_Matrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+//------------------------------------------------------------------------------
+/// Helper routine to avoid duplicating logic between HermitianMatrix
+/// and SymmetricMatrix
+///
+template <typename matrix_type>
+TestMatrix<matrix_type> allocate_test_HeSyMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    using scalar_t = typename matrix_type::value_type;
+
+    slate::Uplo uplo = params.uplo();
+
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return matrix_type( uplo, n, tileNb,
+                            tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return matrix_type( uplo, n, nb, grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return matrix_type::fromScaLAPACK( uplo, n, data, lld, nb,
+                                           grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<matrix_type>(
+                ref_matrix, nonuniform_ref, n, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+
+//------------------------------------------------------------------------------
+/// Allocates a HermitianMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::HermitianMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::HermitianMatrix<float>> allocate_test_HermitianMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<double>> allocate_test_HermitianMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<float>>> allocate_test_HermitianMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::HermitianMatrix<std::complex<double>>> allocate_test_HermitianMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a SymmetricMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    return allocate_test_HeSyMatrix<slate::SymmetricMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, n, params );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::SymmetricMatrix<float>> allocate_test_SymmetricMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<double>> allocate_test_SymmetricMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<float>>> allocate_test_SymmetricMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::SymmetricMatrix<std::complex<double>>> allocate_test_SymmetricMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+
+//------------------------------------------------------------------------------
+/// Allocates a TriangularMatrix and optionally a reference
+/// version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return slate::TriangularMatrix<scalar_t>( uplo, diag, n, tileNb,
+                                                  tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TriangularMatrix<scalar_t>( uplo, diag, n, nb,
+                                                  grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TriangularMatrix<scalar_t>::fromScaLAPACK(
+                    uplo, diag, n, data, lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<slate::TriangularMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, n, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::TriangularMatrix<float>> allocate_test_TriangularMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TriangularMatrix<double>> allocate_test_TriangularMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TriangularMatrix<std::complex<float>>> allocate_test_TriangularMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TriangularMatrix<std::complex<double>>> allocate_test_TriangularMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+//------------------------------------------------------------------------------
+/// Allocates a TrapezoidMatrix and optionally a reference version for testing.
+///
+/// @param[in] ref_matrix
+///     Whether to allocate a reference matrix
+///
+/// @param[in] nonuniform_ref
+///     If params.nonuniform_nb(), whether to also allocate the reference matrix
+///     with non-uniform tiles.
+///
+/// @param[in] m
+///     The number of rows
+///
+/// @param[in] n
+///     The number of columns
+///
+/// @param[in] params
+///     The test params object which contains many of the key parameters
+///
+template <typename scalar_t>
+TestMatrix<slate::TrapezoidMatrix<scalar_t>> allocate_test_TrapezoidMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params)
+{
+    // Load params variables
+    slate::Uplo uplo = params.uplo();
+    slate::Diag diag = params.diag();
+
+    auto construct_irregular = [&] (nb_func_t tileNb,
+                                    dist_func_t tileRank, dist_func_t tileDevice)
+    {
+        return slate::TrapezoidMatrix<scalar_t>( uplo, diag, m, n, tileNb,
+                                                 tileRank, tileDevice, MPI_COMM_WORLD );
+    };
+    auto construct_regular = [&] (int64_t nb, slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TrapezoidMatrix<scalar_t>( uplo, diag, m, n, nb,
+                                                 grid_order, p, q, MPI_COMM_WORLD );
+    };
+    auto construct_scalapack = [&] (scalar_t* data, int64_t lld, int64_t nb,
+                                    slate::GridOrder grid_order, int p, int q )
+    {
+        return slate::TrapezoidMatrix<scalar_t>::fromScaLAPACK(
+                    uplo, diag, m, n, data, lld, nb,
+                    grid_order, p, q, MPI_COMM_WORLD );
+    };
+
+    return allocate_test_shared<slate::TrapezoidMatrix<scalar_t>>(
+                ref_matrix, nonuniform_ref, m, n, params,
+                construct_irregular, construct_regular, construct_scalapack );
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+TestMatrix<slate::TrapezoidMatrix<float>> allocate_test_TrapezoidMatrix<float>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<double>> allocate_test_TrapezoidMatrix<double>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<float>>> allocate_test_TrapezoidMatrix<std::complex<float>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template
+TestMatrix<slate::TrapezoidMatrix<std::complex<double>>> allocate_test_TrapezoidMatrix<std::complex<double>>(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
diff --git a/test/matrix_utils.hh b/test/matrix_utils.hh
index 7d7f44a96..b742b7ccf 100644
--- a/test/matrix_utils.hh
+++ b/test/matrix_utils.hh
@@ -7,8 +7,10 @@
 #define SLATE_MATRIX_UTILS_HH
 
 #include "slate/slate.hh"
+#include "test.hh"
 
 #include "scalapack_wrappers.hh"
+#include "grid_utils.hh"
 
 //------------------------------------------------------------------------------
 // Zero out B, then copy band matrix B from A.
@@ -214,6 +216,20 @@ class TestMatrix {
     using scalar_t = typename MatrixType::value_type;
 
 public:
+    TestMatrix() {}
+
+    TestMatrix(int64_t m_, int64_t n_, int64_t nb_,
+               int p_, int q_, slate::GridOrder grid_order_)
+        : m(m_), n(n_), nb(nb_), p(p_), q(q_), grid_order(grid_order_)
+    {
+        int mpi_rank, myrow, mycol;
+        MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
+        gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
+        this->mloc = num_local_rows_cols( m, nb, myrow, p );
+        this->nloc = num_local_rows_cols( n, nb, mycol, q );
+        this->lld  = blas::max( 1, mloc ); // local leading dimension of A
+    }
+
     // SLATE matrices
     MatrixType A;
     MatrixType Aref;
@@ -224,13 +240,22 @@ class TestMatrix {
 
     // ScaLAPACK configuration
     int64_t m, n, mloc, nloc, lld, nb;
+    int p, q;
+    slate::GridOrder grid_order;
 
 #ifdef SLATE_HAVE_SCALAPACK
-    void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] ) {
+    void ScaLAPACK_descriptor( blas_int ictxt, blas_int A_desc[9] )
+    {
        int64_t info;
        scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mloc, &info);
        slate_assert(info == 0);
     }
+
+    void create_ScaLAPACK_context( blas_int* ictxt )
+    {
+        // Call free function version
+        ::create_ScaLAPACK_context( grid_order, p, q, ictxt );
+    }
 #endif
 };
@@ -244,109 +269,60 @@ inline void mark_params_for_test_Matrix(Params& params)
     params.nonuniform_nb();
     params.origin();
     params.grid_order();
+    params.dev_order();
 }
 
 //------------------------------------------------------------------------------
-/// Allocates a Matrix and optionally a reference version for testing.
-///
-/// @param[in] ref_matrix
-///     Whether to allocate a reference matrix
-///
-/// @param[in] nonuniform_ref
-///     If params.nonuniform_nb(), whether to also allocate the reference matrix
-///     with non-uniform tiles.
-///
-/// @param[in] m
-///     The number of rows
-///
-/// @param[in] n
-///     The number of columns
-///
-/// @param[in] params
-///     The test params object which contains many of the key parameters
-///
-template <typename scalar_t>
-TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
-    bool ref_matrix,
-    bool nonuniform_ref,
-    int64_t m,
-    int64_t n,
-    Params& params)
+/// Marks the parameters used by allocate_test_HermitianMatrix
+inline void mark_params_for_test_HermitianMatrix(Params& params)
 {
-    // Load params variables
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
-    int64_t nb = params.nb();
-    bool nonuniform_nb = params.nonuniform_nb() == 'y';
-    slate::Origin origin = params.origin();
-    slate::GridOrder grid_order = params.grid_order();
-
-    // The object to be returned
-    TestMatrix<slate::Matrix<scalar_t>> matrix;
-    matrix.m = m;
-    matrix.n = n;
-
-    // ScaLAPACK variables for reference matrix
-    int mpi_rank, myrow, mycol;
-    MPI_Comm_rank( MPI_COMM_WORLD, &mpi_rank );
-    gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol );
-    matrix.nb = nb;
-    matrix.mloc = num_local_rows_cols( m, nb, myrow, p );
-    matrix.nloc = num_local_rows_cols( n, nb, mycol, q );
-    matrix.lld  = blas::max( 1, matrix.mloc ); // local leading dimension of A
-
-    // Functions for nonuniform tile sizes.
-    // Odd-numbered tiles are 2*nb, even-numbered tiles are nb.
-    std::function< int64_t (int64_t j) >
-    tileNb = [nb](int64_t j) {
-        return (j % 2 != 0 ? nb*2 : nb);
-    };
-    auto tileRank = slate::func::process_2d_grid( grid_order, p, q );
-    int num_devices_ = blas::get_device_count();
-    auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col,
-                                                   p, num_devices_ );
-
-    // Setup matrix to test SLATE with
-    if (origin != slate::Origin::ScaLAPACK) {
-        // SLATE allocates CPU or GPU tiles.
-        slate::Target origin_target = origin2target( origin );
-        if (nonuniform_nb) {
-            params.msg() = "nonuniform nb " + std::to_string( tileNb( 0 ) )
-                           + ", " + std::to_string( tileNb( 1 ) );
-            matrix.A = slate::Matrix<scalar_t>( m, n, tileNb, tileNb, tileRank,
-                                                tileDevice, MPI_COMM_WORLD);
-        }
-        else {
-            matrix.A = slate::Matrix<scalar_t>( m, n, nb, nb,
-                                                grid_order, p, q, MPI_COMM_WORLD );
-        }
-        matrix.A.insertLocalTiles( origin_target );
-    }
-    else {
-        assert( !nonuniform_nb );
-        // Create SLATE matrix from the ScaLAPACK layouts
-        matrix.A_data.resize( matrix.lld * matrix.nloc );
-        matrix.A = slate::Matrix<scalar_t>::fromScaLAPACK(
-                       m, n, &matrix.A_data[0], matrix.lld, nb, nb,
-                       grid_order, p, q, MPI_COMM_WORLD );
-    }
+    params.uplo();
+    mark_params_for_test_Matrix( params );
+}
 
-    // Setup reference matrix
-    if (ref_matrix) {
-        if (nonuniform_nb && nonuniform_ref) {
-            matrix.Aref = slate::Matrix<scalar_t>( m, n, tileNb, tileNb, tileRank,
-                                                   tileDevice, MPI_COMM_WORLD );
-            matrix.Aref.insertLocalTiles( slate::Target::Host );
-        }
-        else {
-            matrix.Aref_data.resize( matrix.lld * matrix.nloc );
-            matrix.Aref = slate::Matrix<scalar_t>::fromScaLAPACK(
-                              m, n, &matrix.Aref_data[0], matrix.lld, nb, nb,
-                              grid_order, p, q, MPI_COMM_WORLD );
-        }
-    }
+//------------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_SymmetricMatrix
+inline void mark_params_for_test_SymmetricMatrix(Params& params)
+{
+    mark_params_for_test_HermitianMatrix( params );
+}
+
+//------------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TriangularMatrix
+inline void mark_params_for_test_TriangularMatrix(Params& params)
+{
+    params.uplo();
+    params.diag();
+    mark_params_for_test_Matrix( params );
+}
 
-    return matrix;
+//------------------------------------------------------------------------------
+/// Marks the parameters used by allocate_test_TrapezoidMatrix
+inline void mark_params_for_test_TrapezoidMatrix(Params& params)
+{
+    params.uplo();
+    params.diag();
+    mark_params_for_test_Matrix( params );
 }
 
+template <typename scalar_t>
+TestMatrix<slate::Matrix<scalar_t>> allocate_test_Matrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::HermitianMatrix<scalar_t>> allocate_test_HermitianMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::SymmetricMatrix<scalar_t>> allocate_test_SymmetricMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::TriangularMatrix<scalar_t>> allocate_test_TriangularMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t n, Params& params);
+
+template <typename scalar_t>
+TestMatrix<slate::TrapezoidMatrix<scalar_t>> allocate_test_TrapezoidMatrix(
+    bool ref_matrix, bool nonuniform_ref, int64_t m, int64_t n, Params& params);
+
 #endif // SLATE_MATRIX_UTILS_HH
diff --git a/test/run_tests.py b/test/run_tests.py
index 4561e217f..574ecfc10 100755
--- a/test/run_tests.py
+++ b/test/run_tests.py
@@ -121,12 +121,13 @@
 group_opt.add_argument( '--origin', action='store', help='default=%(default)s', default='s' )
 group_opt.add_argument( '--target', action='store', help='default=%(default)s', default='t' )
 group_opt.add_argument( '--lookahead', action='store', help='default=%(default)s', default='1' )
-group_opt.add_argument( '--dev-dist', action='store', help='default=%(default)s', default='c,r' )
 group_opt.add_argument( '--nb', action='store', help='default=%(default)s', default='64,100' )
 group_opt.add_argument( '--nonuniform-nb', action='store', help='default=%(default)s', default='n' )
 group_opt.add_argument( '--nt', action='store', help='default=%(default)s', default='5,10,20' )
 group_opt.add_argument( '--np', action='store', help='number of MPI processes; default=%(default)s', default='1' )
 group_opt.add_argument( '--grid', action='store', help='use p-by-q MPI process grid', default='' )
+group_opt.add_argument( '--grid-order', action='store', help='default=%(default)s', default='' )
+group_opt.add_argument( '--dev-order', action='store', help='default=%(default)s', default='r,c' )
 group_opt.add_argument( '--repeat', action='store', help='times to repeat each test', default='' )
 group_opt.add_argument( '--thresh', action='store', help='default=%(default)s', default='1,0.5' )
 group_opt.add_argument( '--matrix', action='store', help='default=%(default)s', default='' )
@@ -297,11 +298,12 @@
 origin = ' --origin ' + opts.origin if (opts.origin) else ''
 target = ' --target ' + opts.target if (opts.target) else ''
 la = ' --lookahead ' + opts.lookahead if (opts.lookahead) else ''
-ddist = ' --dev-dist ' + opts.dev_dist if (opts.dev_dist) else ''
 nb = ' --nb ' + opts.nb if (opts.nb) else ''
 nonuniform_nb = ' --nonuniform-nb ' + opts.nonuniform_nb if (opts.nonuniform_nb) else ''
 nt = ' --nt ' + opts.nt if (opts.nt) else ''
 grid = ' --grid ' + opts.grid if (opts.grid) else ''
+grid_order = ' --grid-order ' + opts.grid_order if (opts.grid_order) else ''
+dev_order = ' --dev-order ' + opts.dev_order if (opts.dev_order) else ''
 repeat = ' --repeat ' + opts.repeat if (opts.repeat) else ''
 thresh = ' --thresh ' + opts.thresh if (opts.thresh) else ''
 matrix = ' --matrix ' + opts.matrix if (opts.matrix) else ''
@@ -314,6 +316,11 @@
 gen_no_nb = origin + target + grid + check + ref + tol + repeat
 gen_no_target = grid + check + ref + tol + repeat + nb
 
+ge_matrix = grid_order + dev_order
+sy_matrix = grid_order + dev_order + uplo
+he_matrix = grid_order + dev_order + uplo
+tr_matrix = grid_order + dev_order + uplo + diag
+
 if (opts.matrix):
     gen += matrix
 
@@ -348,67 +355,67 @@ def filter_csv( values, csv ):
 cmds += [
     [ 'gbmm',  gen + dtype + la + transA + transB + mnk + ab + kl + ku + matrixBC ],
 
-    [ 'gemm',  gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ],
-    [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ],
-    [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb ],
+    [ 'gemm',  gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ],
+    [ 'gemmA', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ],
+    [ 'gemmC', gen + dtype + la + transA + transB + mnk + ab + matrixBC + nonuniform_nb + ge_matrix ],
 
-    [ 'hemm',  gen + dtype + la + side + uplo + mn + ab + matrixBC ],
+    [ 'hemm',  gen + dtype + la + side + he_matrix + mn + ab + matrixBC ],
     # todo: hemmA GPU support
-    [ 'hemmA', gen_no_target + dtype + la + side + uplo + mn + ab + matrixBC ],
-    [ 'hemmC', gen + dtype + la + side + uplo + mn + ab + matrixBC],
+    [ 'hemmA', gen_no_target + dtype + la + side + he_matrix + mn + ab + matrixBC ],
+    [ 'hemmC', gen + dtype + la + side + he_matrix + mn + ab + matrixBC],
 
     [ 'hbmm',  gen + dtype + la + side + uplo + mn + ab + kd + matrixBC ],
 
-    [ 'herk',  gen + dtype_real    + la + uplo + trans    + mn + ab + matrixC ],
-    [ 'herk',  gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixC ],
+    [ 'herk',  gen + dtype_real    + la + he_matrix + trans    + mn + ab + matrixC ],
+    [ 'herk',  gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixC ],
 
-    [ 'her2k', gen + dtype_real    + la + uplo + trans    + mn + ab + matrixBC ],
-    [ 'her2k', gen + dtype_complex + la + uplo + trans_nc + mn + ab + matrixBC ],
+    [ 'her2k', gen + dtype_real    + la + he_matrix + trans    + mn + ab + matrixBC ],
+    [ 'her2k', gen + dtype_complex + la + he_matrix + trans_nc + mn + ab + matrixBC ],
 
-    [ 'symm',  gen + dtype + la + side + uplo + mn + ab + matrixBC ],
+    [ 'symm',  gen + dtype + la + side + sy_matrix + mn + ab + matrixBC ],
 
-    [ 'syr2k', gen + dtype_real    + la + uplo + trans    + mn + ab + matrixC ],
-    [ 'syr2k', gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixC ],
+    [ 'syr2k', gen + dtype_real    + la + sy_matrix + trans    + mn + ab + matrixC ],
+    [ 'syr2k', gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixC ],
 
-    [ 'syrk',  gen + dtype_real    + la + uplo + trans    + mn + ab + matrixBC ],
-    [ 'syrk',  gen + dtype_complex + la + uplo + trans_nt + mn + ab + matrixBC ],
+    [ 'syrk',  gen + dtype_real    + la + sy_matrix + trans    + mn + ab + matrixBC ],
+    [ 'syrk',  gen + dtype_complex + la + sy_matrix + trans_nt + mn + ab + matrixBC ],
 
     # todo: tbsm fails for nb=8 or 16 with --quick.
     [ 'tbsm',  gen_no_nb + ' --nb 32' + dtype + la + side + uplo + transA + diag + mn + a + kd + matrixB ],
 
-    [ 'trmm',  gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
+    [ 'trmm',  gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
 
-    [ 'trsm',  gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
-    [ 'trsmA', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
-    [ 'trsmB', gen + dtype + la + side + uplo + transA + diag + mn + a + matrixB ],
+    [ 'trsm',  gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
+    [ 'trsmA', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
+    [ 'trsmB', gen + dtype + la + side + tr_matrix + nonuniform_nb + transA + mn + a + matrixB ],
 ]
 
 # LU
 if (opts.lu):
     cmds += [
-        [ 'gesv',         gen + dtype + la + n + thresh + nonuniform_nb ],
-        [ 'gesv_tntpiv',  gen + dtype + la + n ],
-        [ 'gesv_nopiv',   gen + dtype + la + n + nonuniform_nb
+        [ 'gesv',         gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ],
+        [ 'gesv_tntpiv',  gen + dtype + la + n + ge_matrix ],
+        [ 'gesv_nopiv',   gen + dtype + la + n + ge_matrix + nonuniform_nb
                           + ' --matrix rand_dominant' ],
 
         # todo: mn
-        [ 'getrf',        gen + dtype + la + n + thresh + nonuniform_nb ],
-        [ 'getrf_tntpiv', gen + dtype + la + n ],
-        [ 'getrf_nopiv',  gen + dtype + la + n + nonuniform_nb
+        [ 'getrf',        gen + dtype + la + n + ge_matrix + nonuniform_nb + thresh ],
+        [ 'getrf_tntpiv', gen + dtype + la + n + ge_matrix ],
+        [ 'getrf_nopiv',  gen + dtype + la + n + ge_matrix + nonuniform_nb
                          + ' --matrix rand_dominant' ],
 
-        [ 'getrs',        gen + dtype + la + n + trans + thresh + nonuniform_nb ],
-        [ 'getrs_tntpiv', gen + dtype + la + n + trans ],
-        [ 'getrs_nopiv',  gen + dtype + la + n + trans + nonuniform_nb
+        [ 'getrs',        gen + dtype + la + n + trans + ge_matrix + nonuniform_nb + thresh ],
+        [ 'getrs_tntpiv', gen + dtype + la + n + trans + ge_matrix ],
+        [ 'getrs_nopiv',  gen + dtype + la + n + trans + ge_matrix + nonuniform_nb
                          + ' --matrix rand_dominant' ],
 
         [ 'getri',    gen + dtype + la + n ],
         [ 'getriOOP', gen + dtype + la + n ],
         #[ 'gerfs', gen + dtype + la + n + trans ],
         #[ 'geequ', gen + dtype + la + n ],
-        [ 'gesv_mixed',       gen + dtype_double + la + n + nonuniform_nb ],
-        [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + nonuniform_nb ],
-        [ 'gesv_rbt',         gen + dtype + la + n ],
+        [ 'gesv_mixed',       gen + dtype_double + la + n + ge_matrix + nonuniform_nb ],
+        [ 'gesv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + ge_matrix + nonuniform_nb ],
+        [ 'gesv_rbt',         gen + dtype + la + n + ge_matrix ],
     ]
 
 # LU banded
@@ -424,14 +431,14 @@ def filter_csv( values, csv ):
 # Cholesky
 if (opts.chol):
     cmds += [
-        [ 'posv',   gen + dtype + la + n + uplo ],
-        [ 'potrf',  gen + dtype + la + n + uplo + ddist ],
-        [ 'potrs',  gen + dtype + la + n + uplo ],
-        [ 'potri',  gen + dtype + la + n + uplo ],
+        [ 'posv',   gen + dtype + la + n + he_matrix ],
+        [ 'potrf',  gen + dtype + la + n + he_matrix ],
+        [ 'potrs',  gen + dtype + la + n + he_matrix ],
+        [ 'potri',  gen + dtype + la + n ],
         #[ 'porfs', gen + dtype + la + n + uplo ],
         #[ 'poequ', gen + dtype + la + n ],  # only diagonal elements (no uplo)
-        [ 'posv_mixed',       gen + dtype_double + la + n + uplo ],
-        [ 'posv_mixed_gmres', gen + dtype_double + la + n + uplo + ' --nrhs 1' ],
+        [ 'posv_mixed',       gen + dtype_double + la + n + he_matrix ],
+        [ 'posv_mixed_gmres', gen + dtype_double + la + n + ' --nrhs 1' + he_matrix ],
         [ 'trtri', gen + dtype + la + n + uplo + diag ],
     ]
 
@@ -585,9 +592,9 @@ def filter_csv( values, csv ):
 # svd
 if (opts.svd):
     if ('n' in jobu):
-        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' ]]
+        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu n --jobvt n' + ge_matrix ]]
     if ('v' in jobu):
-        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' ]]
+        cmds += [[ 'svd', gen + dtype + la + n + mnk + ' --jobu v --jobvt v' + ge_matrix ]]
 
     cmds += [
         # todo: mn (wide), nb, jobu, jobvt
@@ -600,10 +607,10 @@ def filter_csv( values, csv ):
 # norms
 if (opts.norms):
     cmds += [
-        [ 'genorm', gen + dtype + mn + norm ],
-        [ 'henorm', gen + dtype + n  + norm + uplo ],
-        [ 'synorm', gen + dtype + n  + norm + uplo ],
-        [ 'trnorm', gen + dtype + mn + norm + uplo + diag ],
+        [ 'genorm', gen + dtype + mn + norm + nonuniform_nb + ge_matrix ],
+        [ 'henorm', gen + dtype + n  + norm + nonuniform_nb + he_matrix ],
+        [ 'synorm', gen + dtype + n  + norm + nonuniform_nb + sy_matrix ],
+        [ 'trnorm', gen + dtype + mn + norm + nonuniform_nb + tr_matrix ],
 
         # Banded
        [ 'gbnorm', gen + dtype + mn + kl + ku + norm ],
@@ -628,31 +635,31 @@ def filter_csv( values, csv ):
 # aux
 if (opts.aux):
     cmds += [
-        [ 'add',    gen + dtype + mn + ab + nonuniform_nb ],
-        [ 'tzadd',  gen + dtype + mn + ab + nonuniform_nb + uplo ],
-        [ 'tradd',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'syadd',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'headd',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-
-        [ 'copy',   gen + dtype + mn + nonuniform_nb ],
-        [ 'tzcopy', gen + dtype + mn + nonuniform_nb + uplo ],
-        [ 'trcopy', gen + dtype + n  + nonuniform_nb + uplo ],
-        [ 'sycopy', gen + dtype + n  + nonuniform_nb + uplo ],
-        [ 'hecopy', gen + dtype + n  + nonuniform_nb + uplo ],
-
-        [ 'scale',   gen + dtype + mn + ab + nonuniform_nb ],
-        [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + uplo ],
-        [ 'trscale', gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'syscale', gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'hescale', gen + dtype + n  + ab + nonuniform_nb + uplo ],
-
-        [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb ],
-
-        [ 'set',    gen + dtype + mn + ab + nonuniform_nb ],
-        [ 'tzset',  gen + dtype + mn + ab + nonuniform_nb + uplo ],
-        [ 'trset',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'syset',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
-        [ 'heset',  gen + dtype + n  + ab + nonuniform_nb + uplo ],
+        [ 'add',    gen + dtype + mn + ab + nonuniform_nb + ge_matrix ],
+        [ 'tzadd',  gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'tradd',  gen + dtype + n  + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'syadd',  gen + dtype + n  + ab + nonuniform_nb + sy_matrix ],
+        [ 'headd',  gen + dtype + n  + ab + nonuniform_nb + he_matrix ],
+
+        [ 'copy',   gen + dtype + mn + nonuniform_nb + ge_matrix ],
+        [ 'tzcopy', gen + dtype + mn + nonuniform_nb + ge_matrix + uplo ],
+        [ 'trcopy', gen + dtype + n  + nonuniform_nb + ge_matrix + uplo ],
+        [ 'sycopy', gen + dtype + n  + nonuniform_nb + sy_matrix ],
+        [ 'hecopy', gen + dtype + n  + nonuniform_nb + he_matrix ],
+
+        [ 'scale',   gen + dtype + mn + ab + nonuniform_nb + ge_matrix ],
+        [ 'tzscale', gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'trscale', gen + dtype + n  + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'syscale', gen + dtype + n  + ab + nonuniform_nb + sy_matrix ],
+        [ 'hescale', gen + dtype + n  + ab + nonuniform_nb + he_matrix ],
+
+        [ 'scale_row_col', gen + dtype + mn + equed + nonuniform_nb + ge_matrix ],
+
+        [ 'set',    gen + dtype + mn + ab + nonuniform_nb + ge_matrix ],
+        [ 'tzset',  gen + dtype + mn + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'trset',  gen + dtype + n  + ab + nonuniform_nb + ge_matrix + uplo ],
+        [ 'syset',  gen + dtype + n  + ab + nonuniform_nb + sy_matrix ],
+        [ 'heset',  gen + dtype + n  + ab + nonuniform_nb + he_matrix ],
     ]
 
 # ------------------------------------------------------------------------------
diff --git a/test/scalapack_support_routines.hh b/test/scalapack_support_routines.hh
deleted file mode 100644
index 458dc7e42..000000000
--- a/test/scalapack_support_routines.hh
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright (c) 2009-2023, University of Tennessee. All rights reserved.
-// Copyright (c) 2010, University of Denver, Colorado.
-// SPDX-License-Identifier: BSD-3-Clause
-// This program is free software: you can redistribute it and/or modify it under
-// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
-
-#ifndef SLATE_SCALAPACK_SUPPORT_HH
-#define SLATE_SCALAPACK_SUPPORT_HH
-
-#include <complex>
-
-#include "scalapack_wrappers.hh"
-
-//------------------------------------------------------------------------------
-// Matrix generation
-#define Rnd64_A  6364136223846793005ULL
-#define Rnd64_C  1ULL
-#define RndF_Mul 5.4210108624275222e-20f
-#define RndD_Mul 5.4210108624275222e-20
-
-typedef unsigned long long ull;
-
-static inline ull Rnd64_jump(ull n, ull seed)
-{
-    ull a_k, c_k, ran;
-    int64_t i;
-
-    a_k = Rnd64_A;
-    c_k = Rnd64_C;
-    ran = seed;
-    for (i = 0; n; n >>= 1, ++i) {
-        if (n & 1)
-            ran = a_k*ran + c_k;
-        c_k *= (a_k + 1);
-        a_k *= a_k;
-    }
-    return ran;
-}
-
-template <typename scalar_t>
-static inline void CORE_plrnt(int64_t m, int64_t n, scalar_t* A, int64_t lda,
-                              int64_t bigM, int64_t m0, int64_t n0, ull seed)
-{
-    scalar_t* tmp = A;
-    int64_t i, j;
-    ull ran, jump;
-
-    jump = (ull)m0 + (ull)n0*(ull)bigM;
-    for (j = 0; j < n; ++j) {
-        ran = Rnd64_jump(jump, (ull)seed);
-        for (i = 0; i < m; ++i) {
-            *tmp = 0.5f - ran*RndF_Mul;
-            ran = Rnd64_A*ran + Rnd64_C;
-            ++tmp;
-        }
-        tmp += lda - i;
-        jump += bigM;
-    }
-}
-
-template <typename scalar_t>
-static inline void CORE_plghe(scalar_t bump, int64_t m, int64_t n, scalar_t* A, int64_t lda,
-                              int64_t gM, int64_t m0, int64_t n0, ull seed)
-{
-    scalar_t* tmp = A;
-    int64_t i, j;
-    ull ran, jump;
-
-    jump = (ull)m0 + (ull)n0*(ull)gM;
-    /* Tile diagonal */
-    if (m0 == n0) {
-        for (j = 0; j < n; ++j) {
-            ran = Rnd64_jump(jump, seed);
-
-            for (i = j; i < m; ++i) {
-                *tmp = 0.5f - ran*RndF_Mul;
-                ran = Rnd64_A*ran + Rnd64_C;
-                ++tmp;
-            }
-            tmp += (lda - i + j + 1);
-            jump += gM + 1;
-        }
-        for (j = 0; j < n; ++j) {
-            A[j + j*lda] += bump;
-
-            for (i = 0; i < j; ++i)
-                A[lda*j + i] = A[lda*i + j];
-        }
-    }
-
-    /* Lower part */
-    else if (m0 > n0) {
-        for (j = 0; j < n; ++j) {
-            ran = Rnd64_jump(jump, seed);
-
-            for (i = 0; i < m; ++i) {
-                *tmp = 0.5f - ran*RndF_Mul;
-                ran = Rnd64_A*ran + Rnd64_C;
-                ++tmp;
-            }
-            tmp += (lda - i);
-            jump += gM;
-        }
-    }
-
-    /* Upper part */
-    else if (m0 < n0) {
-        /* Overwrite jump */
-        jump = (ull)n0 + (ull)m0*(ull)gM;
-
-        for (i = 0; i < m; ++i) {
-            ran = Rnd64_jump(jump, seed);
-
-            for (j = 0; j < n; ++j) {
-                A[j*lda + i] = 0.5f - ran*RndF_Mul;
-                ran = Rnd64_A*ran + Rnd64_C;
-            }
-            jump += gM;
-        }
-    }
-}
-
-template <typename scalar_t>
-static void scalapack_pplrnt(scalar_t* A,
-                             int64_t m, int64_t n,
-                             int64_t mb, int64_t nb,
-                             blas_int myrow, blas_int mycol,
-                             blas_int nprow, blas_int npcol,
-                             int64_t lldA,
-                             int64_t seed)
-{
-    blas_int idum1, idum2, iloc, jloc, i0 = 0;
-    blas_int tempm, tempn;
-    scalar_t* Ab;
-    blas_int mb_ = blas_int( mb );
-    blas_int nb_ = blas_int( nb );
-
-    // #pragma omp parallel for
-    for (blas_int i = 1; i <= m; i += mb) {
-        for (blas_int j = 1; j <= n; j += nb) {
-            if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) &&
-                (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) {
-                iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow);
-                jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol);
-
-                Ab = &A[(jloc - 1)*lldA + (iloc - 1) ];
-                tempm = (m - i + 1) > mb ? mb : (m - i + 1);
-                tempn = (n - j + 1) > nb ? nb : (n - j + 1);
-                CORE_plrnt(tempm, tempn, Ab, lldA,
-                           m, (i - 1), (j - 1), seed);
-            }
-        }
-    }
-}
-
-
-template <typename scalar_t>
-static void scalapack_pplghe(scalar_t* A,
-                             int64_t m, int64_t n,
-                             int64_t mb, int64_t nb,
-                             blas_int myrow, blas_int mycol,
-                             blas_int nprow, blas_int npcol,
-                             int64_t lldA,
-                             int64_t seed)
-{
-    blas_int idum1, idum2, iloc, jloc, i0 = 0;
-    int64_t tempm, tempn;
-    scalar_t* Ab;
-    scalar_t bump = (scalar_t)m;
-    blas_int mb_ = blas_int( mb );
-    blas_int nb_ = blas_int( nb );
-
-    // #pragma omp parallel for
-    for (blas_int i = 1; i <= m; i += mb) {
-        for (blas_int j = 1; j <= n; j += nb) {
-            if ((myrow == scalapack_indxg2p(&i, &mb_, &idum1, &i0, &nprow)) &&
-                (mycol == scalapack_indxg2p(&j, &nb_, &idum1, &i0, &npcol))) {
-                iloc = scalapack_indxg2l(&i, &mb_, &idum1, &idum2, &nprow);
-                jloc = scalapack_indxg2l(&j, &nb_, &idum1, &idum2, &npcol);
-
-                Ab = &A[(jloc - 1)*lldA + (iloc - 1) ];
-                tempm = (m - i + 1) > mb ? mb : (m - i + 1);
-                tempn = (n - j + 1) > nb ? nb : (n - j + 1);
-                CORE_plghe(bump, tempm, tempn, Ab, lldA,
-                           m, (i - 1), (j - 1), seed);
-            }
-        }
-    }
-}
-
-#endif // SLATE_SCALAPACK_SUPPORT_HH
diff --git a/test/test.cc b/test/test.cc
index 078df162a..3ba473dc2 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -356,8 +356,8 @@ Params::Params():
     method_lu   ("lu",   5, ParamType::List, slate::MethodLU::PartialPiv, str2methodLU, methodLU2str, "PartialPiv, CALU, NoPiv"),
     method_trsm ("trsm", 4, ParamType::List, 0, str2methodTrsm, methodTrsm2str, "auto=auto, A=trsmA, B=trsmB"),
-    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
-    dev_dist  ("dev-dist", 9, ParamType::List, slate::Dist::Col, str2dist, dist2str, "matrix tiles distribution across local devices (one-dimensional block-cyclic): col=column, row=row"),
+    grid_order("go", 3, ParamType::List, slate::GridOrder::Col, str2grid_order, grid_order2str, "(go) MPI grid order: c=Col, r=Row"),
+    dev_order ("do", 3, ParamType::List, slate::GridOrder::Row, str2grid_order, grid_order2str, "(do) Device grid order: c=Col, r=Row"),
 
     // name, w, type, default, char2enum, enum2char, enum2str, help
     layout ("layout", 6, ParamType::List, slate::Layout::ColMajor, blas::char2layout, blas::layout2char, blas::layout2str, "layout: r=row major, c=column major"),
@@ -463,6 +463,7 @@ Params::Params():
     lookahead.name("la", "lookahead");
     panel_threads.name("pt", "panel-threads");
     grid_order.name("go", "grid-order");
+    dev_order.name("do", "dev-order");
 
     // Change name for the methods to use less space in the stdout
     method_cholQR.name("cholQR", "method-cholQR");
diff --git a/test/test.hh b/test/test.hh
index b0d48e0f3..8dec12510 100644
--- a/test/test.hh
+++ b/test/test.hh
@@ -21,15 +21,10 @@
 // -----------------------------------------------------------------------------
 namespace slate {
 
-enum class Origin {
-    Host,
-    ScaLAPACK,
-    Devices,
-};
-
-enum class Dist {
-    Row,
-    Col,
+enum class Origin : char {
+    Host      = 'H',
+    ScaLAPACK = 'S',
+    Devices   = 'D',
 };
 
 } // namespace slate
@@ -88,7 +83,7 @@ public:
     testsweeper::ParamEnum< slate::Method > method_trsm;
 
     testsweeper::ParamEnum< slate::GridOrder > grid_order;
-    testsweeper::ParamEnum< slate::Dist > dev_dist;
+    testsweeper::ParamEnum< slate::GridOrder > dev_order;
 
     // ----- test matrix parameters
     MatrixParams matrix;
@@ -284,32 +279,6 @@ void test_scale (Params& params, bool run);
 void test_scale_row_col(Params& params, bool run);
 void test_set (Params& params, bool run);
 
-// -----------------------------------------------------------------------------
-inline slate::Dist str2dist(const char* dist)
-{
-    std::string distribution_ = dist;
-    std::transform(
-        distribution_.begin(),
-        distribution_.end(),
-        distribution_.begin(), ::tolower);
-    if (distribution_ == "row" || distribution_ == "r")
-        return slate::Dist::Row;
-    else if (distribution_ == "col" || distribution_ == "c"
-             || distribution_ == "column")
-        return slate::Dist::Col;
-    else
-        throw slate::Exception("unknown distribution");
-}
-
-inline const char* dist2str(slate::Dist dist)
-{
-    switch (dist) {
-        case slate::Dist::Row: return "row";
-        case slate::Dist::Col: return "col";
-    }
-    return "?";
-}
-
 // -----------------------------------------------------------------------------
 inline slate::Origin str2origin(const char* origin)
 {
diff --git a/test/test_add.cc b/test/test_add.cc
index 3e51600b3..0596c73f8 100644
--- a/test/test_add.cc
+++ b/test/test_add.cc
@@ -7,10 +7,9 @@
 #include "test.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
 #include "matrix_utils.hh"
 #include "test_utils.hh"
@@ -49,8 +48,6 @@ void test_add_work(Params& params, bool run)
     else {
         n = params.dim.n();
     }
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     bool ref_only = params.ref() == 'o';
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
@@ -67,6 +64,11 @@ void test_add_work(Params& params, bool run)
     if (! run)
         return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
+
     slate::Options const opts = {
         {slate::Option::Target, target}
     };
@@ -133,7 +135,7 @@ void test_add_work(Params& params, bool run)
 
         // initialize BLACS and ScaLAPACK
         blas_int ictxt, A_desc[9], B_desc[9];
-        create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
+        A_alloc.create_ScaLAPACK_context( &ictxt );
         A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
         B_alloc.ScaLAPACK_descriptor( ictxt, B_desc );
@@ -195,7 +197,7 @@ void test_add_work(Params& params, bool run)
         //Cblacs_exit(1) does not handle re-entering
 #else  // not SLATE_HAVE_SCALAPACK
     SLATE_UNUSED( trans );
-    if (mpi_rank == 0)
+    if (A.mpiRank() == 0)
         printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_bdsqr.cc b/test/test_bdsqr.cc
index 8e4ac041f..b3ffa96f0 100644
--- a/test/test_bdsqr.cc
+++ b/test/test_bdsqr.cc
@@ -7,7 +7,6 @@
 #include "blas.hh"
 #include "test.hh"
 #include "print_matrix.hh"
-#include "scalapack_support_routines.hh"
 #include "internal/internal.hh"
 #include "band_utils.hh"
 #include "grid_utils.hh"
diff --git a/test/test_copy.cc b/test/test_copy.cc
index 84721bb9d..d7b3bf0fc 100644
--- a/test/test_copy.cc
+++ b/test/test_copy.cc
@@ -7,10 +7,9 @@
 #include "test.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
 #include "matrix_utils.hh"
 #include "test_utils.hh"
@@ -47,8 +46,6 @@ void test_copy_work(Params& params, bool run)
     else {
         n = params.dim.n();
     }
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     bool ref_only = params.ref() == 'o';
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
@@ -65,6 +62,11 @@ void test_copy_work(Params& params, bool run)
     if (! run)
         return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
+
     slate::Options const opts = {
         {slate::Option::Target, target}
     };
@@ -135,7 +137,7 @@ void test_copy_work(Params& params, bool run)
 
         // initialize BLACS and ScaLAPACK
         blas_int ictxt, A_desc[9], B_desc[9];
-        create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
+        A_alloc.create_ScaLAPACK_context( &ictxt );
         A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
         B_alloc.ScaLAPACK_descriptor( ictxt, B_desc );
@@ -183,7 +185,7 @@ void test_copy_work(Params& params, bool run)
 #else  // not SLATE_HAVE_SCALAPACK
     SLATE_UNUSED( A_norm );
     SLATE_UNUSED( B_norm );
-    if (mpi_rank == 0)
+    if (A.mpiRank() == 0)
         printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_gbnorm.cc b/test/test_gbnorm.cc
index 883d07aff..7ac9a510d 100644
--- a/test/test_gbnorm.cc
+++ b/test/test_gbnorm.cc
@@ -7,7 +7,6 @@
 #include "test.hh"
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "print_matrix.hh"
 #include "band_utils.hh"
 #include "grid_utils.hh"
diff --git a/test/test_gecondest.cc b/test/test_gecondest.cc
index 9bb9badad..8f365bee8 100644
--- a/test/test_gecondest.cc
+++ b/test/test_gecondest.cc
@@ -11,7 +11,6 @@
 #include "grid_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
diff --git a/test/test_gelqf.cc b/test/test_gelqf.cc
index 915d67455..68a84747c 100644
--- a/test/test_gelqf.cc
+++ b/test/test_gelqf.cc
@@ -11,7 +11,6 @@
 #include "grid_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
diff --git a/test/test_gels.cc b/test/test_gels.cc
index 9ad6a3bbc..2d20e213a 100644
--- a/test/test_gels.cc
+++ b/test/test_gels.cc
@@ -11,7 +11,6 @@
 #include "grid_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
diff --git a/test/test_gemm.cc b/test/test_gemm.cc
index 746e2f74d..8add9e9c5 100644
--- a/test/test_gemm.cc
+++ b/test/test_gemm.cc
@@ -7,11 +7,12 @@
 #include "test.hh"
 #include "blas/flops.hh"
 #include "print_matrix.hh"
+
 #include "grid_utils.hh"
 #include "matrix_utils.hh"
+#include "test_utils.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 
 #include <cmath>
@@ -44,19 +45,15 @@ void test_gemm_work(Params& params, bool run)
     int64_t n = params.dim.n();
     int64_t nrhs = params.nrhs();
     int64_t k = params.dim.k();
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     int64_t lookahead = params.lookahead();
     bool ref_only = params.ref() == 'o';
     slate::Norm norm = params.norm();
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
     bool trace = params.trace() == 'y';
-    bool nonuniform_nb = params.nonuniform_nb() == 'y';
     int verbose = params.verbose();
     slate::Target target = params.target();
     slate::Origin origin = params.origin();
-    slate::GridOrder grid_order = params.grid_order();
     slate::Method method_gemm = params.method_gemm();
     params.matrix.mark();
     params.matrixB.mark();
@@ -77,14 +74,10 @@ void test_gemm_work(Params& params, bool run)
     if (! run)
         return;
 
-    #ifndef SLATE_HAVE_SCALAPACK
-    // Can run ref only when we have ScaLAPACK.
-    if (ref) {
-        if (mpi_rank == 0)
-            printf( "ScaLAPACK not available\n" );
-        ref = false;
-    }
-    #endif
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
 
     slate::Options const opts = {
         {slate::Option::Lookahead, lookahead},
@@ -116,9 +109,14 @@ void test_gemm_work(Params& params, bool run)
     slate::generate_matrix(params.matrixB, B);
     slate::generate_matrix(params.matrixC, C);
 
-    // if reference run is required, copy test data.
+    // If reference run is required, record norms to be used in the check/ref.
+    real_t A_norm=0, B_norm=0, C_orig_norm=0;
     if (ref) {
         slate::copy( C, Cref );
+
+        A_norm = slate::norm(norm, A);
+        B_norm = slate::norm(norm, B);
+        C_orig_norm = slate::norm(norm, Cref);
     }
 
     if (transA == slate::Op::Trans)
@@ -135,14 +133,6 @@ void test_gemm_work(Params& params, bool run)
     slate_assert(B.nt() == C.nt());
     slate_assert(A.nt() == B.mt());
 
-    // If reference run is required, record norms to be used in the check/ref.
-    real_t A_norm=0, B_norm=0, C_orig_norm=0;
-    if (ref) {
-        A_norm = slate::norm(norm, A);
-        B_norm = slate::norm(norm, B);
-        C_orig_norm = slate::norm(norm, Cref);
-    }
-
     // If check run, perform first half of SLATE residual check.
     TestMatrix<slate::Matrix<scalar_t>> X_alloc, Y_alloc, Z_alloc;
     if (check && ! ref) {
@@ -224,14 +214,10 @@ void test_gemm_work(Params& params, bool run)
     if (ref) {
         #ifdef SLATE_HAVE_SCALAPACK
             // comparison with reference routine from ScaLAPACK
-            if (nonuniform_nb) {
-                params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK";
-                return;
-            }
 
             // initialize BLACS and ScaLAPACK
             blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9];
-            create_ScaLAPACK_context( grid_order, p, q, &ictxt );
+            A_alloc.create_ScaLAPACK_context( &ictxt );
             A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
             B_alloc.ScaLAPACK_descriptor( ictxt, B_desc );
diff --git a/test/test_genorm.cc b/test/test_genorm.cc
index 5437d9429..41c03cee5 100644
--- a/test/test_genorm.cc
+++ b/test/test_genorm.cc
@@ -7,10 +7,11 @@
 #include "test.hh"
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
+#include "matrix_utils.hh"
+#include "test_utils.hh"
 
 #include <cmath>
 #include <set>
@@ -39,12 +40,17 @@ void test_genorm_work(Params& params, bool run)
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
     bool trace = params.trace() == 'y';
+    bool nonuniform_nb = params.nonuniform_nb() == 'y';
+    bool ref_copy = nonuniform_nb && (check || ref);
     int verbose = params.verbose();
     int extended = params.extended();
     slate::Origin origin = params.origin();
     slate::Target target = params.target();
+    slate::GridOrder grid_order = params.grid_order();
     params.matrix.mark();
 
+    mark_params_for_test_Matrix( params );
+
     // mark non-standard output values
     params.time();
     params.ref_time();
 
     if (! run)
         return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params, true )) {
+        return;
+    }
+
     slate::Options const opts = {
         {slate::Option::Target, target}
     };
 
-    // MPI variables
-    int mpi_rank, myrow, mycol;
-    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-    gridinfo(mpi_rank, p, q, &myrow, &mycol);
+    auto A_alloc = allocate_test_Matrix<scalar_t>( ref_copy, false, m, n, params );
 
-    // Matrix A: figure out local size.
-    int64_t mlocA = num_local_rows_cols(m, nb, myrow, p);
-    int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
-    int64_t lldA  = blas::max(1, mlocA); // local leading dimension of A
+    auto& A = A_alloc.A;
+    auto& Aref = A_alloc.Aref;
 
-    // Allocate ScaLAPACK data if needed.
-    std::vector<scalar_t> A_data;
-    if (check || ref || origin == slate::Origin::ScaLAPACK) {
-        A_data.resize( lldA * nlocA );
-    }
+    slate::generate_matrix(params.matrix, A);
 
-    slate::Matrix<scalar_t> A;
-    if (origin != slate::Origin::ScaLAPACK) {
-        // SLATE allocates CPU or GPU tiles.
-        slate::Target origin_target = origin2target(origin);
-        A = slate::Matrix<scalar_t>(m, n, nb, p, q, MPI_COMM_WORLD);
-        A.insertLocalTiles(origin_target);
-    }
-    else {
-        // Create SLATE matrix from the ScaLAPACK layout.
-        A = slate::Matrix<scalar_t>::fromScaLAPACK(
-                m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD);
+    if (ref_copy) {
+        copy_matrix( A, Aref );
     }
 
-    slate::generate_matrix(params.matrix, A);
-
     std::vector<real_t> values;
     if (scope == slate::NormScope::Columns) {
         values.resize(A.n());
@@ -136,33 +127,21 @@ void test_genorm_work(Params& params, bool run)
 #ifdef SLATE_HAVE_SCALAPACK
         // comparison with reference routine from ScaLAPACK
 
-        // BLACS/MPI variables
-        blas_int ictxt, p_, q_, myrow_, mycol_;
-        blas_int A_desc[9];
-        blas_int mpi_rank_ = 0, nprocs = 1;
-
         // initialize BLACS and ScaLAPACK
-        Cblacs_pinfo(&mpi_rank_, &nprocs);
-        slate_assert( mpi_rank_ == mpi_rank );
-        slate_assert(p*q <= nprocs);
-        Cblacs_get(-1, 0, &ictxt);
-        Cblacs_gridinit(&ictxt, "Col", p, q);
-        Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_);
-        slate_assert( p == p_ );
-        slate_assert( q == q_ );
-        slate_assert( myrow == myrow_ );
-        slate_assert( mycol == mycol_ );
-
-        int64_t info;
-        scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info);
-        slate_assert(info == 0);
-
-        if (origin != slate::Origin::ScaLAPACK) {
+        blas_int ictxt, A_desc[9];
+        A_alloc.create_ScaLAPACK_context( &ictxt );
+        A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
+
+        auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data;
+
+        if (origin != slate::Origin::ScaLAPACK && !ref_copy) {
+            A_data.resize( A_alloc.lld * A_alloc.nloc );
+
             copy(A, &A_data[0], A_desc);
         }
 
         // allocate work space
-        std::vector<real_t> worklange(std::max(mlocA, nlocA));
+        std::vector<real_t> worklange(std::max(A_alloc.mloc, A_alloc.nloc));
 
         // (Sca)LAPACK norms don't support trans; map One <=> Inf norm.
         slate::Norm op_norm = norm;
@@ -199,10 +178,6 @@ void test_genorm_work(Params& params, bool run)
         }
         time = barrier_get_wtime(MPI_COMM_WORLD) - time;
 
-        //A_norm_ref = lapack::lange(
-        //    op_norm,
-        //    m, n, &A_data[0], lldA);
-
         if (scope == slate::NormScope::Matrix) {
             // difference between norms
             error = std::abs(A_norm - A_norm_ref) / A_norm_ref;
@@ -216,7 +191,7 @@ void test_genorm_work(Params& params, bool run)
             error /= sqrt(m*n);
         }
 
-        if (verbose && mpi_rank == 0) {
+        if (verbose && A.mpiRank() == 0) {
             printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n",
                    A_norm, A_norm_ref, A_norm_ref - A_norm, error);
         }
@@ -238,107 +213,113 @@ void test_genorm_work(Params& params, bool run)
 
     //---------- extended tests
     if (extended && scope == slate::NormScope::Matrix) {
-        // seed all MPI processes the same
-        srand(1234);
-
-        // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols,
-        // up to 64 tiles total.
-        // Indices may be out-of-bounds if mt or nt is small, so check in loops.
- int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - if (i < 0 || i >= mt) + else { + + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) - continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib) { + for (auto i : i_indices) { + if (i < 0 || i >= mt) + continue; + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) continue; - } - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib) { + continue; + } - A_norm = slate::norm(norm, A, opts); + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the genorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } - A_norm_ref = scalapack_plange( - norm2str(norm), m, n, - &A_data[0], 1, 1, A_desc, - &worklange[0]); + A_norm = slate::norm(norm, A, opts); - // difference between norms - error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } + A_norm_ref = scalapack_plange( + norm2str(norm), m, n, + &A_data[0], 1, 1, A_desc, + &worklange[0]); - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); } - } - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the genorm should copy data itself. 
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } } @@ -351,7 +332,7 @@ void test_genorm_work(Params& params, bool run) SLATE_UNUSED( A_norm ); SLATE_UNUSED( extended ); SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_geqrf.cc b/test/test_geqrf.cc index 6ae7d8302..18c8bf114 100644 --- a/test/test_geqrf.cc +++ b/test/test_geqrf.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_gesv.cc b/test/test_gesv.cc index a19beadb8..1e7b00541 100644 --- a/test/test_gesv.cc +++ b/test/test_gesv.cc @@ -8,11 +8,11 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + #include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include @@ -59,8 +59,6 @@ void test_gesv_work(Params& params, bool run) int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t ib = params.ib(); int64_t lookahead = params.lookahead(); int64_t panel_threads = params.panel_threads(); @@ -68,13 +66,10 @@ void test_gesv_work(Params& params, bool run) bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; bool trace = params.trace() == 'y'; - bool nonuniform_nb = params.nonuniform_nb() == 'y'; int verbose = params.verbose(); int timer_level = params.timer_level(); SLATE_UNUSED(verbose); - slate::Origin origin = params.origin(); slate::Target target = params.target(); - slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); params.matrixB.mark(); @@ -156,8 +151,8 @@ void test_gesv_work(Params& params, bool run) if (! 
run) return; - if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { - params.msg() = "skipping: nonuniform tile not supported with ScaLAPACK"; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { return; } @@ -385,14 +380,10 @@ void test_gesv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - if (nonuniform_nb) { - params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; - return; - } // initialize BLACS and ScaLAPACK blas_int ictxt, Aref_desc[9], Bref_desc[9]; - create_ScaLAPACK_context( grid_order, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); @@ -433,7 +424,7 @@ void test_gesv_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_getri.cc b/test/test_getri.cc index 6a0343e52..3693dcba1 100644 --- a/test/test_getri.cc +++ b/test/test_getri.cc @@ -10,7 +10,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hb2st.cc b/test/test_hb2st.cc index 5c3f4bce7..607250fe1 100644 --- a/test/test_hb2st.cc +++ b/test/test_hb2st.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_hbnorm.cc b/test/test_hbnorm.cc index c62de133a..bf5c5cd2c 100644 --- a/test/test_hbnorm.cc +++ b/test/test_hbnorm.cc @@ -7,7 +7,6 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "band_utils.hh" diff --git a/test/test_heev.cc b/test/test_heev.cc index 505c1c0f3..fc7077f2a 100644 --- a/test/test_heev.cc +++ b/test/test_heev.cc @@ -9,7 +9,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hegst.cc b/test/test_hegst.cc index 93d78d275..35574087d 100644 --- a/test/test_hegst.cc +++ b/test/test_hegst.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "auxiliary/Debug.hh" #include "grid_utils.hh" diff --git a/test/test_hegv.cc b/test/test_hegv.cc index d342be443..f6061b0a1 100644 --- a/test/test_hegv.cc +++ b/test/test_hegv.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_hemm.cc b/test/test_hemm.cc index 946a60798..d874c40ac 100644 --- a/test/test_hemm.cc +++ b/test/test_hemm.cc @@ -8,10 +8,11 @@ #include "blas/flops.hh" #include "print_matrix.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -42,10 +43,7 @@ void test_hemm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = 
params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -59,6 +57,9 @@ void test_hemm_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +73,11 @@ void test_hemm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -85,93 +91,48 @@ void test_hemm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. - std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } - - slate::HermitianMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::HermitianMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); + auto A_alloc = allocate_test_HermitianMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - std::vector Cref_data; - slate::Matrix Cref; - if (check || ref) { - // For simplicity, always use ScaLAPACK format for ref matrices. 
- Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - m, n, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy(C, Cref); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). @@ -237,6 +198,9 @@ void test_hemm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -255,56 +219,31 @@ void test_hemm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. 
 copy(A, &A_data[0], A_desc);
 copy(B, &B_data[0], B_desc);
 copy(C, &C_data[0], C_desc);
 }
- // allocate workspace for norms
- int64_t ldw = nb*ceildiv( ceildiv( mlocA, nb ),
- scalapack_ilcm( p, q ) / p );
- std::vector<real_t> worklansy(2*nlocA + mlocA + ldw);
- std::vector<real_t> worklange(std::max({mlocC, nlocC, mlocB, nlocB}));
-
- // get norms of the original data
- real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An,
- &A_data[0], 1, 1, A_desc, &worklansy[0]);
- real_t B_norm = scalapack_plange(
- norm2str(norm), Bm, Bn, &B_data[0],1, 1, B_desc, &worklange[0]);
- real_t C_orig_norm = scalapack_plange(
- norm2str(norm), Cm, Cn, &Cref_data[0], 1, 1, Cref_desc, &worklange[0]);
-
 //==================================================
 // Run ScaLAPACK reference routine.
 //==================================================
@@ -315,12 +254,13 @@ void test_hemm_work(Params& params, bool run)
 &Cref_data[0], 1, 1, Cref_desc);
 time = barrier_get_wtime(MPI_COMM_WORLD) - time;
- // Local operation: error = Cref_data - C_data
- blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1);
+ // get differences C = C - Cref
+ slate::add(-one, Cref, one, C);
+
+ print_matrix( "Diff", C, params );
- // norm(Cref_data - C_data)
- real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0],
- 1, 1, Cref_desc, &worklange[0]);
+ // norm(C - Cref)
+ real_t C_diff_norm = slate::norm(norm, C);
 real_t error = C_diff_norm
 / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm
@@ -336,7 +276,7 @@ void test_hemm_work(Params& params, bool run)
 Cblacs_gridexit(ictxt);
 //Cblacs_exit(1) does not handle re-entering
 #else // not SLATE_HAVE_SCALAPACK
- if (mpi_rank == 0)
+ if (A.mpiRank() == 0)
 printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_henorm.cc b/test/test_henorm.cc
index 08b5fd4bc..a9a24a9fe 100644
--- a/test/test_henorm.cc
+++ b/test/test_henorm.cc
@@ -7,10 +7,11 @@
 #include "test.hh"
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
+#include "matrix_utils.hh"
+#include "test_utils.hh"
 #include
 #include
@@ -36,12 +37,17 @@ void test_henorm_work(Params& params, bool run)
 bool check = params.check() == 'y';
 bool ref = params.ref() == 'y';
 bool trace = params.trace() == 'y';
+ bool nonuniform_nb = params.nonuniform_nb() == 'y';
+ bool ref_copy = nonuniform_nb && (check || ref);
 int verbose = params.verbose();
 int extended = params.extended();
 slate::Origin origin = params.origin();
 slate::Target target = params.target();
+ slate::GridOrder grid_order = params.grid_order();
 params.matrix.mark();
+ mark_params_for_test_HermitianMatrix( params );
+
 // mark non-standard output values
 params.time();
 params.ref_time();
@@ -51,43 +57,25 @@
 return;
 }
+ // Check for common invalid combinations
+ if (is_invalid_parameters( params, true )) {
+ return;
+ }
+
 slate::Options const opts = {
 {slate::Option::Target, target}
 };
- // MPI variables
- int mpi_rank, myrow, mycol;
- MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
- gridinfo(mpi_rank, p, q, &myrow, &mycol);
+ auto A_alloc = allocate_test_HermitianMatrix( ref_copy, false, n, params );
- // Matrix A: figure out local size.
- int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::HermitianMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::HermitianMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::HermitianMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + if (ref_copy) { + copy_matrix( A, Aref ); } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - } - - slate::generate_matrix( params.matrix, A ); print_matrix("A", A, params); @@ -109,41 +97,31 @@ void test_henorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); - } - - if (check || ref) { + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); + } + // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; std::vector worklanhe( lwork ); + // comparison with reference routine from ScaLAPACK + //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -153,10 +131,6 @@ void test_henorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lanhe( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -166,7 +140,7 @@ void test_henorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -184,140 +158,131 @@ void test_henorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklanhe(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. - int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); - } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) - continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); + } + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? 
i < j : i > j)) continue; - } - - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } - - A_norm = slate::norm(norm, A, opts); - - real_t A_norm_ref = scalapack_planhe( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); - - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } - - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) + continue; + + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } + + A_norm = slate::norm(norm, A, opts); + + A_norm_ref = scalapack_planhe( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklanhe[0]); + + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! 
okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } - - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_her2k.cc b/test/test_her2k.cc index a2af3cef2..7fe3ed067 100644 --- a/test/test_her2k.cc +++ b/test/test_her2k.cc @@ -8,10 +8,11 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +39,7 @@ void test_her2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); real_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +51,9 @@ void test_her2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +67,11 @@ void test_her2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -81,72 +87,30 @@ void test_her2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. 
- int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector< scalar_t > A_data(lldA*nlocA); - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - std::vector< scalar_t > B_data(lldB*nlocB); - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - std::vector< scalar_t > C_data(lldC*nlocC); - - slate::Matrix A, B; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); + + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector< scalar_t > Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -172,17 +136,19 @@ void test_her2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! 
ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^H X)) + (conj(alpha) B (A^H X)) + (beta C X). // Y = beta C X @@ -221,6 +187,9 @@ void test_her2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -239,54 +208,30 @@ void test_her2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or 
CPU tiles.
+ copy(A, &A_data[0], A_desc);
+ copy(B, &B_data[0], B_desc);
+ copy(C, &C_data[0], C_desc);
+ }
 //==================================================
 // Run ScaLAPACK reference routine.
@@ -300,13 +245,11 @@ void test_her2k_work(Params& params, bool run)
 print_matrix("Cref", Cref, params);
- // local operation: error = Cref_data - C_data
- blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1);
+ // get differences C = C - Cref
+ slate::add(-one, Cref, one, C);
- // norm(Cref_data - C_data)
- real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn,
- &Cref_data[0], 1, 1, Cref_desc,
- &worklansy[0]);
+ // norm(C - Cref)
+ real_t C_diff_norm = slate::norm(norm, C);
 real_t error = C_diff_norm
 / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm
@@ -323,7 +266,7 @@ void test_her2k_work(Params& params, bool run)
 Cblacs_gridexit(ictxt);
 //Cblacs_exit(1) does not handle re-entering
 #else // not SLATE_HAVE_SCALAPACK
- if (mpi_rank == 0)
+ if (A.mpiRank() == 0)
 printf( "ScaLAPACK not available\n" );
 #endif
 }
diff --git a/test/test_herk.cc b/test/test_herk.cc
index 8e5660241..2c86f7500 100644
--- a/test/test_herk.cc
+++ b/test/test_herk.cc
@@ -7,10 +7,11 @@
 #include "test.hh"
 #include "blas/flops.hh"
+#include "matrix_utils.hh"
+#include "test_utils.hh"
+
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
-#include "grid_utils.hh"
 #include
 #include
@@ -35,10 +36,7 @@ void test_herk_work(Params& params, bool run)
 int64_t k = params.dim.k();
 real_t alpha = params.alpha.get();
 real_t beta = params.beta.get();
- int p = params.grid.m();
- int q = params.grid.n();
 int64_t nrhs = params.nrhs();
- int64_t nb = params.nb();
 int64_t lookahead = params.lookahead();
 slate::Norm norm = params.norm();
 bool check = params.check() == 'y';
@@ -47,7 +45,10 @@ void test_herk_work(Params& params, bool run)
 slate::Origin origin = params.origin();
 slate::Target target = params.target();
 params.matrix.mark();
- params.matrixB.mark();
+ params.matrixC.mark();
+
+ mark_params_for_test_HermitianMatrix( params );
+ mark_params_for_test_Matrix( params );
 // mark non-standard output values
 params.time();
@@ -62,6 +63,11 @@
 if (! run)
 return;
+ // Check for common invalid combinations
+ if (is_invalid_parameters( params )) {
+ return;
+ }
+
 slate::Options const opts = {
 {slate::Option::Lookahead, lookahead},
 {slate::Option::Target, target},
@@ -75,64 +81,26 @@
 // setup so op(A) is n-by-k
 int64_t Am = (transA == slate::Op::NoTrans ? n : k);
 int64_t An = (transA == slate::Op::NoTrans ? k : n);
- int64_t Cm = n;
 int64_t Cn = n;
- // MPI variables
- int mpi_rank, myrow, mycol;
- MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
- gridinfo(mpi_rank, p, q, &myrow, &mycol);
-
- // Matrix A: figure out local size.
- int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p);
- int64_t nlocA = num_local_rows_cols(An, nb, mycol, q);
- int64_t lldA = blas::max(1, mlocA); // local leading dimension of A
-
- // Matrix C: figure out local size.
- int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p);
- int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q);
- int64_t lldC = blas::max(1, mlocC); // local leading dimension of C
-
- // Allocate ScaLAPACK data if needed.
- std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_HermitianMatrix( ref, true, Cn, params ); - slate::Matrix A; - slate::HermitianMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - C = slate::HermitianMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); - slate::generate_matrix( params.matrixB, C ); - - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - slate::HermitianMatrix Cref; - std::vector Cref_data; - if (ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::HermitianMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + slate::generate_matrix( params.matrixC, C ); + + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -147,17 +115,19 @@ void test_herk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^H X) + (beta C X). // Y = beta C X @@ -189,6 +159,9 @@ void test_herk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
real_t y_norm = slate::norm( norm, Y, opts ); @@ -207,48 +180,26 @@ void test_herk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); - - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, - A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + + // Copy SLATE result back from GPU or CPU tiles. + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); + } //================================================== // Run ScaLAPACK reference routine. 
@@ -259,13 +210,11 @@ void test_herk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, - &Cref_data[0], 1, 1, Cref_desc, - &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -281,7 +230,7 @@ void test_herk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_hesv.cc b/test/test_hesv.cc index 2e8f54af1..8dbe4b83c 100644 --- a/test/test_hesv.cc +++ b/test/test_hesv.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" #include "print_matrix.hh" diff --git a/test/test_pocondest.cc b/test/test_pocondest.cc index 47d45109e..081e03e29 100644 --- a/test/test_pocondest.cc +++ b/test/test_pocondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_posv.cc b/test/test_posv.cc index 46f3c3cc6..398ac323b 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -9,11 +9,11 @@ #include "lapack/flops.hh" #include "print_matrix.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" -#include "scalapack_copy.hh" #include "auxiliary/Debug.hh" -#include "grid_utils.hh" #include #include @@ -33,8 +33,6 @@ void test_posv_work(Params& params, bool run) slate::Uplo uplo = params.uplo(); int64_t n = params.dim.n(); int64_t nrhs = params.nrhs(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); bool ref_only = params.ref() == 'o'; @@ -46,12 +44,14 @@ void test_posv_work(Params& params, bool run) int timer_level = params.timer_level(); slate::Origin origin = params.origin(); slate::Target target = params.target(); - slate::Dist dev_dist = params.dev_dist(); params.matrix.mark(); params.matrixB.mark(); slate::Method methodTrsm = params.method_trsm(); slate::Method methodHemm = params.method_hemm(); + mark_params_for_test_HermitianMatrix( params ); + mark_params_for_test_Matrix( params ); + // Currently only posv* supports timer_level >= 2. 
std::vector timer_lvl_support{ "posv", "posv_mixed", "posv_mixed_gmres" }; @@ -124,6 +124,11 @@ void test_posv_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -134,16 +139,6 @@ void test_posv_work(Params& params, bool run) {slate::Option::UseFallbackSolver, fallback}, }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - if (target != slate::Target::Devices && dev_dist != slate::Dist::Col) { - params.msg() = "skipping: dev_dist = Row applies only to target devices"; - return; - } - if ((params.routine == "posv_mixed" || params.routine == "posv_mixed_gmres") && ! std::is_same::value) { params.msg() = "skipping: unsupported mixed precision; must be type=d or z"; @@ -158,95 +153,31 @@ void test_posv_work(Params& params, bool run) int64_t info = 0; - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(nrhs, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // ScaLAPACK data if needed. - std::vector A_data, B_data; - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::HermitianMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); - - slate::HermitianMatrix A; - slate::Matrix B, X; - std::vector X_data; - if (origin != slate::Origin::ScaLAPACK) { - if (dev_dist == slate::Dist::Row && target == slate::Target::Devices) { - // slate_assert(target == slate::Target::Devices); - // todo: doesn't work when lookahead is greater than 2 - // slate_assert(lookahead < 3); - - auto tileNb = slate::func::uniform_blocksize( n, nb ); - auto tileRank = slate::func::process_2d_grid( slate::GridOrder::Col, - p, q ); - int num_devices = blas::get_device_count(); - auto tileDevice = slate::func::device_1d_grid( slate::GridOrder::Col, - p, num_devices ); - - A = slate::HermitianMatrix( - uplo, n, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - B = slate::Matrix( - n, nrhs, tileNb, tileNb, tileRank, tileDevice, MPI_COMM_WORLD); - } - else { - A = slate::HermitianMatrix( - uplo, n, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix( - n, nrhs, nb, p, q, MPI_COMM_WORLD); - } - - // SLATE allocates CPU or GPU tiles. 
- slate::Target origin_target = origin2target(origin); - A.insertLocalTiles(origin_target); - - B.insertLocalTiles(origin_target); - - if (is_iterative) { - X_data.resize(lldB*nlocB); - X = slate::Matrix(n, nrhs, nb, p, q, MPI_COMM_WORLD); - X.insertLocalTiles(origin_target); - } - } - else { - // Create SLATE matrix from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - A = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - n, nrhs, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - if (is_iterative) { - X_data.resize(lldB*nlocB); - X = slate::Matrix::fromScaLAPACK( - n, nrhs, &X_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto A_alloc = allocate_test_HermitianMatrix( check || ref, true, n, params ); + auto B_alloc = allocate_test_Matrix( check || ref, true, n, nrhs, params ); + TestMatrix> X_alloc; + if (is_iterative) { + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); } + auto& A = A_alloc.A; + auto& A_data = A_alloc.A_data; + auto& Aref = A_alloc.Aref; + auto& Aref_data = A_alloc.Aref_data; + auto& B = B_alloc.A; + auto& B_data = B_alloc.A_data; + auto& Bref = B_alloc.Aref; + auto& Bref_data = B_alloc.Aref_data; + auto& X = X_alloc.A; + slate::generate_matrix(params.matrix, A); slate::generate_matrix(params.matrixB, B); print_matrix("A", A, params); print_matrix("B", B, params); // if check is required, copy test data and create a descriptor for it - std::vector Aref_data(lldA*nlocA); - std::vector Bref_data(lldB*nlocB); std::vector B_orig; - slate::HermitianMatrix Aref; - slate::Matrix Bref; if (check || ref) { - // SLATE matrix wrappers for the reference data - Aref = slate::HermitianMatrix::fromScaLAPACK( - uplo, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - Bref = slate::Matrix::fromScaLAPACK( - n, nrhs, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - slate::copy( A, Aref ); slate::copy( B, Bref ); @@ -413,35 +344,12 @@ void test_posv_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // A comparison with a reference routine from ScaLAPACK for timing only - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], Aref_desc[9]; - blas_int B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Aref_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, n, nrhs, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, Aref_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, Aref_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); if (check) { // restore Bref_data @@ -477,9 +385,13 @@ void test_posv_work(Params& params, bool run) if (verbose > 2) { if (origin == slate::Origin::ScaLAPACK) { - slate::Debug::diffLapackMatrices(n, n, &A_data[0], 
lldA, &Aref_data[0], lldA, nb, nb); + slate::Debug::diffLapackMatrices( + n, n, &A_data[0], A_alloc.lld, + &Aref_data[0], A_alloc.lld, nb, nb); if (params.routine != "potrf") { - slate::Debug::diffLapackMatrices(n, nrhs, &B_data[0], lldB, &Bref_data[0], lldB, nb, nb); + slate::Debug::diffLapackMatrices( + n, nrhs, &B_data[0], B_alloc.lld, + &Bref_data[0], B_alloc.lld, nb, nb); } } } @@ -487,7 +399,7 @@ void test_posv_work(Params& params, bool run) //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK SLATE_UNUSED( verbose ); - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_potri.cc b/test/test_potri.cc index 647181acd..07d926ffc 100644 --- a/test/test_potri.cc +++ b/test/test_potri.cc @@ -10,7 +10,6 @@ #include "print_matrix.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "grid_utils.hh" diff --git a/test/test_scale.cc b/test/test_scale.cc index a98ffcb2d..c04652e39 100644 --- a/test/test_scale.cc +++ b/test/test_scale.cc @@ -7,10 +7,9 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + #include "matrix_utils.hh" #include "test_utils.hh" @@ -49,8 +48,6 @@ void test_scale_work(Params& params, bool run) else { n = params.dim.n(); } - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); bool ref_only = params.ref() == 'o'; bool ref = params.ref() == 'y' || ref_only; bool check = params.check() == 'y' && ! ref_only; @@ -67,6 +64,11 @@ void test_scale_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; @@ -124,7 +126,7 @@ void test_scale_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::Max, A ); @@ -164,7 +166,7 @@ void test_scale_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_scale_row_col.cc b/test/test_scale_row_col.cc index 7c1ed4ffd..18e0819fd 100644 --- a/test/test_scale_row_col.cc +++ b/test/test_scale_row_col.cc @@ -7,10 +7,9 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + #include "matrix_utils.hh" #include "test_utils.hh" @@ -57,6 +56,7 @@ void test_scale_row_col_work( Params& params, bool run ) bool trace = params.trace() == 'y'; int verbose = params.verbose(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); mark_params_for_test_Matrix( params ); @@ -68,6 +68,11 @@ void test_scale_row_col_work( Params& params, bool run ) if (! 
run) return;
 
+    // Check for common invalid combinations
+    if (is_invalid_parameters( params )) {
+        return;
+    }
+
     slate::Options const opts =  {
         {slate::Option::Target, target}
     };
@@ -138,7 +143,7 @@ void test_scale_row_col_work( Params& params, bool run )
 
     // initialize BLACS and ScaLAPACK
     blas_int ictxt, A_desc[9];
-    create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt );
+    A_alloc.create_ScaLAPACK_context( &ictxt );
     A_alloc.ScaLAPACK_descriptor( ictxt, A_desc );
 
     real_t A_max = slate::norm( slate::Norm::Max, A );
@@ -146,7 +151,7 @@
 
     std::vector<real_t> Rlocal( A_alloc.mloc ), Clocal( A_alloc.nloc );
     int myrow, mycol;
-    gridinfo( A.mpiRank(), slate::GridOrder::Col, p, q, &myrow, &mycol );
+    gridinfo( A.mpiRank(), grid_order, p, q, &myrow, &mycol );
 
     // Copy local part of R.
     int64_t ii = 0, iilocal = 0;
@@ -231,7 +236,7 @@
         //Cblacs_exit(1) does not handle re-entering
     #else  // not SLATE_HAVE_SCALAPACK
         SLATE_UNUSED( verbose );
-        if (mpi_rank == 0)
+        if (A.mpiRank() == 0)
             printf( "ScaLAPACK not available\n" );
     #endif
 }
diff --git a/test/test_set.cc b/test/test_set.cc
index 2dc088f0a..85eed4200 100644
--- a/test/test_set.cc
+++ b/test/test_set.cc
@@ -7,10 +7,9 @@
 #include "test.hh"
 
 #include "scalapack_wrappers.hh"
-#include "scalapack_support_routines.hh"
 #include "scalapack_copy.hh"
 #include "print_matrix.hh"
-#include "grid_utils.hh"
+
 #include "matrix_utils.hh"
 #include "test_utils.hh"
 
@@ -49,8 +48,6 @@ void test_set_work(Params& params, bool run)
     else {
         n = params.dim.n();
     }
-    int64_t p = params.grid.m();
-    int64_t q = params.grid.n();
     bool ref_only = params.ref() == 'o';
     bool ref = params.ref() == 'y' || ref_only;
     bool check = params.check() == 'y' && ! ref_only;
@@ -67,6 +64,11 @@ void test_set_work(Params& params, bool run)
     if (!
run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; @@ -124,7 +126,7 @@ void test_set_work(Params& params, bool run) // initialize BLACS and ScaLAPACK blas_int ictxt, A_desc[9]; - create_ScaLAPACK_context( slate::GridOrder::Col, p, q, &ictxt ); + A_alloc.create_ScaLAPACK_context( &ictxt ); A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); real_t A_norm = slate::norm( slate::Norm::One, A ); @@ -160,7 +162,7 @@ void test_set_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_stedc.cc b/test/test_stedc.cc index b28271c10..73ff18ec9 100644 --- a/test/test_stedc.cc +++ b/test/test_stedc.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" #include "matrix_generator.hh" diff --git a/test/test_stedc_deflate.cc b/test/test_stedc_deflate.cc index 9ad2dedd8..077710af2 100644 --- a/test/test_stedc_deflate.cc +++ b/test/test_stedc_deflate.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_secular.cc b/test/test_stedc_secular.cc index 2e5aeabaf..e321bb8cd 100644 --- a/test/test_stedc_secular.cc +++ b/test/test_stedc_secular.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_sort.cc b/test/test_stedc_sort.cc index 96aa1d451..26eff3fd7 100644 --- a/test/test_stedc_sort.cc +++ b/test/test_stedc_sort.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_stedc_z_vector.cc b/test/test_stedc_z_vector.cc index bc8a5013f..64ca668a2 100644 --- a/test/test_stedc_z_vector.cc +++ b/test/test_stedc_z_vector.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_steqr2.cc b/test/test_steqr2.cc index 4d092669e..ff5ec29b9 100644 --- a/test/test_steqr2.cc +++ b/test/test_steqr2.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_sterf.cc b/test/test_sterf.cc index 1d0a7f2a5..5488ef7fe 100644 --- a/test/test_sterf.cc +++ b/test/test_sterf.cc @@ -7,7 +7,7 @@ #include "blas.hh" #include "test.hh" #include "print_matrix.hh" -#include "scalapack_support_routines.hh" + #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_svd.cc b/test/test_svd.cc index 2a1fb276b..36c4e9ba3 100644 --- a/test/test_svd.cc +++ b/test/test_svd.cc @@ -8,10 +8,11 @@ #include "blas/flops.hh" #include "lapack/flops.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include @@ -35,9 +36,6 @@ void test_svd_work( Params& params, bool run ) lapack::Job jobvt = params.jobvt(); int64_t m = params.dim.m(); int64_t n = 
params.dim.n(); - int64_t p = params.grid.m(); - int64_t q = params.grid.n(); - int64_t nb = params.nb(); int64_t ib = params.ib(); int64_t panel_threads = params.panel_threads(); int64_t lookahead = params.lookahead(); @@ -51,6 +49,10 @@ void test_svd_work( Params& params, bool run ) slate::Target target = params.target(); params.matrix.mark(); + mark_params_for_test_Matrix( params ); + // nonuniform nb is not always supported in the reduction to band + params.nonuniform_nb.used( false ); + params.time(); params.ref_time(); params.error2(); @@ -86,6 +88,11 @@ void test_svd_work( Params& params, bool run ) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + if (check && ! ref && (jobu == slate::Job::NoVec || jobvt == slate::Job::NoVec)) { params.msg() = "job = NoVec requires --ref y to check singular values"; @@ -98,48 +105,6 @@ void test_svd_work( Params& params, bool run ) {slate::Option::InnerBlocking, ib} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - int64_t min_mn = std::min(m, n); - - // Figure out local size. - // matrix A (local input), m-by-n - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - std::vector A_data; - std::vector Acpy_data; - - // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; - // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. - int64_t Um = m; - int64_t Un = jobu == slate::Job::AllVec ? m : min_mn; - int64_t VTm = jobvt == slate::Job::AllVec ? n : min_mn; - int64_t VTn = n; - - // matrix U (local output), U(m, min_mn), singular values of A - int64_t mlocU = num_local_rows_cols( Um, nb, myrow, p ); - int64_t nlocU = num_local_rows_cols( Un, nb, mycol, q ); - int64_t lldU = blas::max(1, mlocU); // local leading dimension of U - std::vector U_data(1); - - // matrix VT (local output), VT(min_mn, n) - int64_t mlocVT = num_local_rows_cols( VTm, nb, myrow, p ); - int64_t nlocVT = num_local_rows_cols( VTn, nb, mycol, q ); - int64_t lldVT = blas::max(1, mlocVT); // local leading dimension of VT - std::vector VT_data(1); - - // array Sigma (global output), singular values of A - std::vector Sigma(min_mn); - - slate::Matrix A; // (m, n); - slate::Matrix U; // (m, min_mn); - slate::Matrix VT; // (min_mn, n); - slate::Matrix Acpy; - bool wantu = (jobu == slate::Job::Vec || jobu == slate::Job::AllVec || jobu == slate::Job::SomeVec); @@ -147,45 +112,28 @@ void test_svd_work( Params& params, bool run ) || jobvt == slate::Job::AllVec || jobvt == slate::Job::SomeVec); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); + int64_t min_mn = std::min(m, n); - Acpy = slate::Matrix(m, n, nb, p, q, MPI_COMM_WORLD); - Acpy.insertLocalTiles(origin_target); + // U is either m-by-min( m, n ) for some vec, or m-by-m for all vec; + // VT is either min( m, n )-by-n for some vec, or n-by-n for all vec. + int64_t Um = wantu ? m : 0; + int64_t Un = wantu ? (jobu == slate::Job::AllVec ? m : min_mn) : 0; + int64_t VTm = wantvt ? (jobvt == slate::Job::AllVec ? n : min_mn) : 0; + int64_t VTn = wantvt ? 
n : 0; - if (wantu) { - U = slate::Matrix( Um, Un, nb, p, q, MPI_COMM_WORLD ); - U.insertLocalTiles(origin_target); - } - if (wantvt) { - VT = slate::Matrix( VTm, VTn, nb, p, q, MPI_COMM_WORLD ); - VT.insertLocalTiles(origin_target); - } - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A_data.resize( lldA * nlocA ); - A = slate::Matrix::fromScaLAPACK( - m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + // array Sigma (global output), singular values of A + std::vector Sigma(min_mn); - Acpy_data.resize( lldA * nlocA ); - Acpy = slate::Matrix::fromScaLAPACK( - m, n, &Acpy_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + auto A_alloc = allocate_test_Matrix( check || ref, true, m, n, params ); + auto U_alloc = allocate_test_Matrix( false, true, Um, Un, params ); + auto VT_alloc = allocate_test_Matrix( false, true, VTm, VTn, params ); + // TODO Acpy isn't always needed + auto Acpy_alloc = allocate_test_Matrix( false, true, m, n, params ); - if (wantu) { - U_data.resize(lldU*nlocU); - U = slate::Matrix::fromScaLAPACK( - Um, Un, &U_data[0], lldU, nb, p, q, MPI_COMM_WORLD ); - } - if (wantvt) { - VT_data.resize(lldVT*nlocVT); - VT = slate::Matrix::fromScaLAPACK( - VTm, VTn, &VT_data[0], lldVT, nb, p, q, MPI_COMM_WORLD ); - } - } + auto& A = A_alloc.A; + auto& U = U_alloc.A; + auto& VT = VT_alloc.A; + auto& Acpy = Acpy_alloc.A; if (verbose >= 1) { printf( "%% A %6lld-by-%6lld\n", llong( A.m() ), llong( A.n() ) ); @@ -201,15 +149,10 @@ void test_svd_work( Params& params, bool run ) slate::generate_matrix( params.matrix, A); print_matrix( "A", A, params ); - slate::Matrix Aref; std::vector Sigma_ref; - std::vector Aref_data; if (check || ref) { Sigma_ref.resize( min_mn ); - Aref_data.resize( lldA * nlocA ); - Aref = slate::Matrix::fromScaLAPACK( - m, n, &Aref_data[0], lldA, nb, p, q, MPI_COMM_WORLD ); - slate::copy( A, Aref ); + slate::copy( A, A_alloc.Aref ); slate::copy( A, Acpy ); } @@ -272,8 +215,8 @@ void test_svd_work( Params& params, bool run ) Rm = blas::max( Rm, m ); if (jobvt == slate::Job::AllVec) Rm = blas::max( Rm, n ); - slate::Matrix R( Rm, Rm, nb, p, q, MPI_COMM_WORLD ); - R.insertLocalTiles(); + auto R_alloc = allocate_test_Matrix( false, true, Rm, Rm, params ); + auto R = R_alloc.A; if (wantu) { //================================================== @@ -344,42 +287,21 @@ void test_svd_work( Params& params, bool run ) #ifdef SLATE_HAVE_SCALAPACK // Run reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - blas_int A_desc[9]; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - blas_int U_desc[9]; - scalapack_descinit(U_desc, m, min_mn, nb, nb, 0, 0, ictxt, mlocU, &info); - slate_assert(info == 0); - - blas_int VT_desc[9]; - scalapack_descinit(VT_desc, min_mn, n, nb, nb, 0, 0, ictxt, mlocVT, &info); - slate_assert(info == 0); - - // Allocate U and VT if not already allocated. - // If origin=scalapack, just overwrite SLATE's U and VT. 
- if (wantu) { - U_data.resize( lldU * nlocU ); - } - if (wantvt) { - VT_data.resize( lldVT * nlocVT ); + blas_int ictxt, A_desc[9], U_desc[9], VT_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + U_alloc.ScaLAPACK_descriptor( ictxt, U_desc ); + VT_alloc.ScaLAPACK_descriptor( ictxt, VT_desc ); + + auto& Aref_data = A_alloc.Aref_data; + auto& U_data = U_alloc.A_data; + auto& VT_data = VT_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK) { + U_data.resize( U_alloc.lld * U_alloc.nloc ); + VT_data.resize( VT_alloc.lld * VT_alloc.nloc ); } // ScaLAPACK uses job = N and V (same as S); @@ -444,7 +366,7 @@ void test_svd_work( Params& params, bool run ) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_symm.cc b/test/test_symm.cc index c80dd3d98..8d6ebed19 100644 --- a/test/test_symm.cc +++ b/test/test_symm.cc @@ -6,11 +6,13 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +37,7 @@ void test_symm_work(Params& params, bool run) int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -50,6 +49,9 @@ void test_symm_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -63,6 +65,11 @@ void test_symm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,102 +80,48 @@ void test_symm_work(Params& params, bool run) // sizes of data int64_t An = (side == slate::Side::Left ? m : n); - int64_t Am = An; int64_t Bm = m; int64_t Bn = n; int64_t Cm = m; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
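// -----------------------------------------------------------------------------
// Note: the deletions in these tests all funnel into the TestMatrix helper
// added in test/matrix_utils.cc. A minimal sketch of its shape, inferred only
// from the members this patch uses (the real definition is in
// test/matrix_utils.hh):
//
//     template <typename MatrixType>
//     class TestMatrix {
//     public:
//         using scalar_t = typename MatrixType::value_type;
//
//         MatrixType A, Aref;               // matrix under test + ref copy
//         std::vector<scalar_t> A_data;     // ScaLAPACK-layout storage for A
//         std::vector<scalar_t> Aref_data;  // ScaLAPACK-layout storage for Aref
//         int64_t mloc, nloc, lld;          // local rows, cols, leading dim
//
//         void create_ScaLAPACK_context( blas_int* ictxt );
//         void ScaLAPACK_descriptor( blas_int ictxt, blas_int desc[9] );
//     };
//
// allocate_test_Matrix( ref_copy, ..., m, n, params ) appears to build one
// from params (nb, grid, origin); the first flag requests the Aref copy. The
// second flag's role is not visible in this hunk.
// -----------------------------------------------------------------------------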
- std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_SymmetricMatrix( false, true, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_Matrix( ref, true, Cm, Cn, params ); - slate::SymmetricMatrix A; - slate::Matrix B, C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::SymmetricMatrix(uplo, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::Matrix(Cm, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::Matrix::fromScaLAPACK( - Cm, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A); slate::generate_matrix( params.matrixB, B); slate::generate_matrix( params.matrixC, C); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data and create a descriptor for it. - slate::Matrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::Matrix::fromScaLAPACK( - Cm, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif - - if (side == slate::Side::Left) - slate_assert(A.mt() == C.mt()); - else - slate_assert(A.mt() == C.nt()); - slate_assert(B.mt() == C.mt()); - slate_assert(B.nt() == C.nt()); + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); - if (trace) slate::trace::Trace::on(); - else slate::trace::Trace::off(); + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, m, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); if (side == slate::Side::Left ) { // Compute Y = alpha A * (B * X) + (beta C * X). 
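// -----------------------------------------------------------------------------
// Note: this check verifies symm without any ScaLAPACK run. With a random X
// fixed before the timed run, Y = alpha A (B X) + beta (C X) is accumulated
// from the *original* operands; after symm overwrites C = alpha A B + beta C,
// the product C X must reproduce Y. A minimal sketch of the first half for
// side = Left, with Z the An-by-nrhs scratch allocated above and one/zero the
// usual scalar constants (names assumed in scope):
//
//     slate::gemm( one, C, X, zero, Y, opts );      // Y = C X
//     slate::gemm( one, B, X, zero, Z, opts );      // Z = B X
//     slate::symm( slate::Side::Left,
//                  alpha, A, Z, beta, Y, opts );    // Y = alpha A Z + beta Y
//
// The second half (after the run) then measures || C X - Y ||.
// -----------------------------------------------------------------------------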
@@ -192,6 +145,16 @@ void test_symm_work(Params& params, bool run) throw slate::Exception("unknown side"); } + if (side == slate::Side::Left) + slate_assert(A.mt() == C.mt()); + else + slate_assert(A.mt() == C.nt()); + slate_assert(B.mt() == C.mt()); + slate_assert(B.nt() == C.nt()); + + if (trace) slate::trace::Trace::on(); + else slate::trace::Trace::off(); + double time = barrier_get_wtime(MPI_COMM_WORLD); //================================================== @@ -218,6 +181,9 @@ void test_symm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. real_t y_norm = slate::norm( norm, Y, opts ); @@ -236,56 +202,31 @@ void test_symm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. copy(A, &A_data[0], A_desc); copy(B, &B_data[0], B_desc); copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - size_t ldw = nb*ceildiv( ceildiv( mlocA, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocA + mlocA + ldw); - std::vector worklange(std::max({mlocC, nlocC, mlocB, nlocB})); - - // get norms of the original data - real_t A_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), An, - &A_data[0], 1, 1, A_desc, &worklansy[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, - B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -297,12 +238,13 @@ void test_symm_work(Params& params, bool run) MPI_Barrier(MPI_COMM_WORLD); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); + + print_matrix( "Diff", C, params ); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plange(norm2str(norm), Cm, Cn, &Cref_data[0], - 1, 1, Cref_desc, &worklange[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(An) + 2) * std::abs(alpha) * A_norm * B_norm @@ -317,7 +259,7 @@ void test_symm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_synorm.cc b/test/test_synorm.cc index 0976e59c5..eb3dfc025 100644 --- a/test/test_synorm.cc +++ b/test/test_synorm.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -36,12 +37,17 @@ void test_synorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_SymmetricMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -49,44 +55,26 @@ void test_synorm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); + auto A_alloc = allocate_test_SymmetricMatrix( ref_copy, false, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } - - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::SymmetricMatrix A0(uplo, n, nb, p, q, MPI_COMM_WORLD); + slate::generate_matrix(params.matrix, A); - slate::SymmetricMatrix A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::SymmetricMatrix(uplo, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. 
- A = slate::SymmetricMatrix::fromScaLAPACK( - uplo, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix(params.matrix, A); - print_matrix("A", A, params); if (trace) slate::trace::Trace::on(); @@ -107,40 +95,28 @@ void test_synorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, lldA, &info); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); - } - - if (check || ref) { + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); + } + // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), + int64_t ldw = nb*ceildiv( ceildiv( A_alloc.nloc, nb ), scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklansy(lwork); + int64_t lwork = 2*A_alloc.mloc + A_alloc.nloc + ldw; + std::vector worklansy( lwork ); //================================================== // Run ScaLAPACK reference routine. @@ -151,10 +127,6 @@ void test_synorm_work(Params& params, bool run) n, &A_data[0], 1, 1, A_desc, &worklansy[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lansy( - // norm, A.uplo(), - // n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One || norm == slate::Norm::Inf) { @@ -164,7 +136,7 @@ void test_synorm_work(Params& params, bool run) error /= n; // = sqrt( n*n ); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -182,140 +154,131 @@ void test_synorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - // allocate work space - int64_t ldw = nb*ceildiv( ceildiv( nlocA, nb ), - scalapack_ilcm( p, q ) / p ); - int64_t lwork = 2*mlocA + nlocA + ldw; - std::vector worklansy(lwork); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if nt is small, so check in loops. 
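// -----------------------------------------------------------------------------
// Note on the grid_order guard added below: the extended test pokes a peak
// value through both the SLATE tile and the mirrored ScaLAPACK array, and the
// local-index arithmetic it uses,
//
//     int64_t ilocal = int(i / p)*nb + ii;          // local row in A_data
//     int64_t jlocal = int(j / q)*nb + jj;          // local col in A_data
//     A_data[ ilocal + jlocal*A_alloc.lld ] = peak;
//
// presumably only matches the BLACS descriptor when tile (i, j) lives on
// process (i % p, j % q), i.e. for a column-major process grid; hence the
// warning and early exit for row-major grids.
// -----------------------------------------------------------------------------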
- int64_t nt = A.nt(); - std::set j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - j_indices.insert(rand() % nt); - } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : j_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) - continue; - int64_t ib = std::min(n - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); + } + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if nt is small, so check in loops. + int64_t nt = A.nt(); + std::set j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : j_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= nt || (uplo == slate::Uplo::Lower ? i < j : i > j)) continue; - } - - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } - - A_norm = slate::norm(norm, A, opts); - - real_t A_norm_ref = scalapack_plansy( - norm2str(norm), uplo2str(A.uplo()), - n, &A_data[0], 1, 1, A_desc, &worklansy[0]); - - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One || norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(n*n); - } - - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && ! slate::is_complex::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan. - bool okay = (std::isnan(real(peak)) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! 
okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real( peak ), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + int64_t ib = std::min(n - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) + continue; + + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } + + A_norm = slate::norm(norm, A, opts); + + A_norm_ref = scalapack_plansy( + norm2str(norm), uplo2str(A.uplo()), + n, &A_data[0], 1, 1, A_desc, &worklansy[0]); + + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One || norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(n*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan. + bool okay = (std::isnan(real(peak)) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real( peak ), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } - - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. 
- A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } } } } } - } - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if ((check || ref) && mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - #endif + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_syr2k.cc b/test/test_syr2k.cc index 2ae006ce1..cf051cabd 100644 --- a/test/test_syr2k.cc +++ b/test/test_syr2k.cc @@ -8,10 +8,11 @@ #include "print_matrix.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,10 +39,7 @@ void test_syr2k_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -53,6 +51,9 @@ void test_syr2k_work(Params& params, bool run) params.matrixB.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -66,6 +67,11 @@ void test_syr2k_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,77 +85,30 @@ void test_syr2k_work(Params& params, bool run) int64_t An = (trans == slate::Op::NoTrans ? k : n); int64_t Bm = Am; int64_t Bn = An; - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
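// -----------------------------------------------------------------------------
// Note: the mloc/nloc/lld bookkeeping deleted here (and in the other tests)
// is ScaLAPACK's numroc computation, now hidden inside the allocation helper.
// Roughly, for an m-long dimension split into nb-blocks over p process rows,
// the count owned by process row myrow is (a sketch, assuming the
// distribution starts at process row 0):
//
//     int64_t nblocks = m / nb;             // full blocks in the dimension
//     int64_t mloc    = (nblocks / p)*nb;   // full blocks every row owns
//     int64_t extra   = nblocks % p;
//     if (myrow < extra)
//         mloc += nb;                       // one extra full block
//     else if (myrow == extra)
//         mloc += m % nb;                   // the trailing partial block
//
// with lld = max( 1, mloc ).
// -----------------------------------------------------------------------------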
- std::vector A_data, B_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto B_alloc = allocate_test_Matrix( false, true, Bm, Bn, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - slate::Matrix A, B; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector< scalar_t > Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - print_matrix("Initial Cref", Cref, params); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + B_norm = slate::norm(norm, B); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A and B matrices, // and make a shallow copy of them for transposing. @@ -175,17 +134,19 @@ void test_syr2k_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = (alpha A (B^T X)) + alpha B (A^T X)) + (beta C X). // Y = beta C X @@ -224,6 +185,9 @@ void test_syr2k_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
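// -----------------------------------------------------------------------------
// Note: the norms feeding the ScaLAPACK comparison are now taken up front,
// while A, B, and C still hold the inputs (syr2k overwrites C in place):
//
//     real_t A_norm = 0, B_norm = 0, C_orig_norm = 0;
//     if (ref) {
//         slate::copy( C, Cref );                   // keep the original C
//         A_norm      = slate::norm( norm, A );
//         B_norm      = slate::norm( norm, B );
//         C_orig_norm = slate::norm( norm, Cref );
//     }
//
// This replaces the scalapack_plange/plansy calls (and their workspace) that
// previously recomputed these from the ScaLAPACK mirrors after the fact.
// -----------------------------------------------------------------------------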
real_t y_norm = slate::norm( norm, Y, opts ); @@ -242,54 +206,31 @@ void test_syr2k_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy( 2*nlocC + mlocC + ldw ); - std::vector worklange( blas::max( mlocA, mlocB, nlocA, nlocB ) ); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t B_norm = scalapack_plange(norm2str(norm), Bm, Bn, &B_data[0], 1, 1, B_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -302,11 +243,11 @@ void test_syr2k_work(Params& params, bool run) print_matrix("Cref", Cref, params); - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(2*k) + 2) * std::abs(alpha) * A_norm * B_norm @@ -323,7 +264,7 @@ void test_syr2k_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_syrk.cc b/test/test_syrk.cc index 03d6de765..bae13cf10 100644 --- a/test/test_syrk.cc +++ b/test/test_syrk.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -35,10 +36,7 @@ void test_syrk_work(Params& params, bool run) int64_t k = params.dim.k(); scalar_t alpha = params.alpha.get(); scalar_t beta = params.beta.get(); - int p = params.grid.m(); - int q = params.grid.n(); int64_t nrhs = params.nrhs(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -49,6 +47,9 @@ void test_syrk_work(Params& params, bool run) params.matrix.mark(); params.matrixC.mark(); + mark_params_for_test_SymmetricMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -62,6 +63,11 @@ void test_syrk_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -73,64 +79,26 @@ void test_syrk_work(Params& params, bool run) // setup so op(A) is n-by-k int64_t Am = (transA == slate::Op::NoTrans ? n : k); int64_t An = (transA == slate::Op::NoTrans ? k : n); - int64_t Cm = n; int64_t Cn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix C: figure out local size. - int64_t mlocC = num_local_rows_cols(Cm, nb, myrow, p); - int64_t nlocC = num_local_rows_cols(Cn, nb, mycol, q); - int64_t lldC = blas::max(1, mlocC); // local leading dimension of C - - // Allocate ScaLAPACK data if needed. 
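// -----------------------------------------------------------------------------
// Note: allocation is now lazy. When origin == ScaLAPACK the helper
// presumably already holds the ScaLAPACK-layout buffers; otherwise they are
// resized only inside the reference branch, right before copying the SLATE
// result back, e.g. (shape as used below in this test):
//
//     if (origin != slate::Origin::ScaLAPACK) {
//         A_data.resize( A_alloc.lld * A_alloc.nloc );
//         C_data.resize( C_alloc.lld * C_alloc.nloc );
//         copy( A, &A_data[0], A_desc );   // SLATE tiles -> ScaLAPACK layout
//         copy( C, &C_data[0], C_desc );
//     }
//
// so runs that skip --ref never pay for the ScaLAPACK mirror.
// -----------------------------------------------------------------------------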
- std::vector A_data, C_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - C_data.resize( lldC * nlocC ); - } + auto A_alloc = allocate_test_Matrix( false, true, Am, An, params ); + auto C_alloc = allocate_test_SymmetricMatrix( ref, true, Cn, params ); - slate::Matrix A; - slate::SymmetricMatrix C; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::Matrix(Am, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - C = slate::SymmetricMatrix(uplo, Cn, nb, p, q, MPI_COMM_WORLD); - C.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::Matrix::fromScaLAPACK( - Am, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - C = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &C_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& C = C_alloc.A; + auto& Cref = C_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixC, C ); - #ifdef SLATE_HAVE_SCALAPACK - // If reference run is required, copy test data. - slate::SymmetricMatrix Cref; - std::vector Cref_data; - if (check || ref) { - Cref_data.resize( lldC * nlocC ); - Cref = slate::SymmetricMatrix::fromScaLAPACK( - uplo, Cn, &Cref_data[0], lldC, nb, p, q, MPI_COMM_WORLD); - slate::copy( C, Cref ); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, C_orig_norm=0; + if (ref) { + slate::copy( C, Cref ); + + A_norm = slate::norm(norm, A); + C_orig_norm = slate::norm(norm, Cref); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -145,17 +113,19 @@ void test_syrk_work(Params& params, bool run) else slate::trace::Trace::off(); // If check run, perform first half of SLATE residual check. - slate::Matrix X, Y, Z; + TestMatrix> X_alloc, Y_alloc, Z_alloc; if (check && ! ref) { - X = slate::Matrix( An, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - Y = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); - Z = slate::Matrix( Am, nrhs, nb, p, q, MPI_COMM_WORLD); - Z.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix( false, true, An, nrhs, params ); + Y_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + Z_alloc = allocate_test_Matrix( false, true, Am, nrhs, params ); + + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + auto& Z = Z_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); - generate_matrix( mp, X ); + slate::generate_matrix( mp, X ); // Compute Y = alpha A (A^T X) + (beta C X). // Y = beta C X @@ -187,6 +157,9 @@ void test_syrk_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, C*X - Y. 
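// -----------------------------------------------------------------------------
// Note: syrk analogue of the symm/syr2k checks. The first half accumulated
// Y = alpha A (A^T X) + beta (C X) from the original operands, so after syrk
// overwrites C = alpha A A^T + beta C, the product C X must reproduce Y; the
// error is scaled by the usual rank-k bound, as in the code further down:
//
//     real_t error = C_diff_norm
//                  / (sqrt( real_t(k) + 2 ) * std::abs( alpha ) * A_norm * A_norm
//                     + 2 * std::abs( beta ) * C_orig_norm);
// -----------------------------------------------------------------------------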
real_t y_norm = slate::norm( norm, Y, opts ); @@ -205,49 +178,27 @@ void test_syrk_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], C_desc[9], Cref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(C_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); - - scalapack_descinit(Cref_desc, Cm, Cn, nb, nb, 0, 0, ictxt, mlocC, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], C_desc[9], Cref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, C_desc ); + C_alloc.ScaLAPACK_descriptor( ictxt, Cref_desc ); + + auto& A_data = A_alloc.A_data; + auto& C_data = C_alloc.A_data; + auto& Cref_data = C_alloc.Aref_data; if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + C_data.resize( C_alloc.lld * C_alloc.nloc ); + // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( C, &C_data[0], C_desc ); + copy(A, &A_data[0], A_desc); + copy(C, &C_data[0], C_desc); } - // allocate workspace for norms - int64_t ldw = nb*ceildiv( ceildiv( mlocC, nb ), - scalapack_ilcm( p, q ) / p ); - std::vector worklansy(2*nlocC + mlocC + ldw); - std::vector worklange(std::max(mlocA, nlocA)); - - // get norms of the original data - real_t A_norm = scalapack_plange(norm2str(norm), Am, An, &A_data[0], 1, 1, A_desc, &worklange[0]); - real_t C_orig_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); - //================================================== // Run ScaLAPACK reference routine. 
//================================================== @@ -257,11 +208,11 @@ void test_syrk_work(Params& params, bool run) &Cref_data[0], 1, 1, Cref_desc); time = barrier_get_wtime( MPI_COMM_WORLD ) - time; - // local operation: error = Cref_data - C_data - blas::axpy(Cref_data.size(), -1.0, &C_data[0], 1, &Cref_data[0], 1); + // get differences C = C - Cref + slate::add(-one, Cref, one, C); - // norm(Cref_data - C_data) - real_t C_diff_norm = scalapack_plansy(norm2str(norm), uplo2str(uplo), Cn, &Cref_data[0], 1, 1, Cref_desc, &worklansy[0]); + // norm(C - Cref) + real_t C_diff_norm = slate::norm(norm, C); real_t error = C_diff_norm / (sqrt(real_t(k) + 2) * std::abs(alpha) * A_norm * A_norm @@ -277,7 +228,7 @@ void test_syrk_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_tb2bd.cc b/test/test_tb2bd.cc index a1ab1aafa..29204bda2 100644 --- a/test/test_tb2bd.cc +++ b/test/test_tb2bd.cc @@ -8,7 +8,6 @@ #include "test.hh" #include "print_matrix.hh" #include "grid_utils.hh" -#include "scalapack_support_routines.hh" #include #include diff --git a/test/test_tbsm.cc b/test/test_tbsm.cc index 2442245b4..b7204bb90 100644 --- a/test/test_tbsm.cc +++ b/test/test_tbsm.cc @@ -8,7 +8,6 @@ #include "blas/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "print_matrix.hh" #include "band_utils.hh" #include "grid_utils.hh" diff --git a/test/test_trcondest.cc b/test/test_trcondest.cc index 74fbac410..1301854fb 100644 --- a/test/test_trcondest.cc +++ b/test/test_trcondest.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_trmm.cc b/test/test_trmm.cc index 26e0b3b64..a8b6d08e6 100644 --- a/test/test_trmm.cc +++ b/test/test_trmm.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "blas/flops.hh" +#include "matrix_utils.hh" +#include "test_utils.hh" + #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "grid_utils.hh" #include #include @@ -38,9 +39,6 @@ void test_trmm_work(Params& params, bool run) int64_t nrhs = params.nrhs(); scalar_t alpha = params.alpha.get(); slate::Op transB = params.transB(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -51,6 +49,9 @@ void test_trmm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -65,6 +66,11 @@ void test_trmm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target} @@ -79,60 +85,24 @@ void test_trmm_work(Params& params, bool run) int64_t Bm = (transB == slate::Op::NoTrans ? m : n); int64_t Bn = (transB == slate::Op::NoTrans ? n : m); - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. 
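// -----------------------------------------------------------------------------
// Note: with the hand-rolled MPI/grid setup deleted, rank and grid queries
// move onto the objects themselves, a pattern repeated across this patch:
//
//     if (A.mpiRank() == 0)                    // was: mpi_rank == 0
//         printf( "ScaLAPACK not available\n" );
//
//     int myrow, mycol;
//     gridinfo( A.mpiRank(), grid_order, p, q, &myrow, &mycol );
//
// so the tests no longer need their own MPI_Comm_rank / Cblacs_pinfo
// boilerplate.
// -----------------------------------------------------------------------------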
- int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed. - std::vector<scalar_t> A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - B_data.resize( lldB * nlocB ); - } + auto A_alloc = allocate_test_TriangularMatrix<scalar_t>( false, true, An, params ); + auto B_alloc = allocate_test_Matrix<scalar_t>( ref, true, Bm, Bn, params ); - slate::TriangularMatrix<scalar_t> A; - slate::Matrix<scalar_t> B; - slate::Target origin_target = origin2target(origin); - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - A = slate::TriangularMatrix<scalar_t>(uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix<scalar_t>(Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrices from the ScaLAPACK layouts. - A = slate::TriangularMatrix<scalar_t>::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - B = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; generate_matrix( params.matrix, A ); generate_matrix( params.matrixB, B ); - #ifdef SLATE_HAVE_SCALAPACK - // if reference run is required, copy test data. - std::vector<scalar_t> Bref_data; - if (ref) { - Bref_data.resize( lldB * nlocB ); - auto Bref = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - slate::copy( B, Bref ); - } - #endif + // If reference run is required, record norms to be used in the check/ref. + real_t A_norm=0, B_orig_norm=0; + if (ref) { + slate::copy( B, Bref ); + + A_norm = slate::norm(norm, A); + B_orig_norm = slate::norm(norm, B); + } // Keep the original untransposed A matrix, // and make a shallow copy of it for transposing. @@ -148,14 +118,16 @@ void test_trmm_work(Params& params, bool run) B = conj_transpose( B ); // If check run, perform first half of SLATE residual check. - slate::Matrix<scalar_t> X, X2, Y; + TestMatrix<slate::Matrix<scalar_t>> X_alloc, X2_alloc, Y_alloc; if (check && ! ref) { - X = slate::Matrix<scalar_t>( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X.insertLocalTiles(origin_target); - X2 = slate::Matrix<scalar_t>( n, nrhs, nb, p, q, MPI_COMM_WORLD ); - X2.insertLocalTiles(origin_target); - Y = slate::Matrix<scalar_t>( m, nrhs, nb, p, q, MPI_COMM_WORLD ); - Y.insertLocalTiles(origin_target); + X_alloc = allocate_test_Matrix<scalar_t>( false, true, n, nrhs, params ); + X2_alloc = allocate_test_Matrix<scalar_t>( false, true, n, nrhs, params ); + Y_alloc = allocate_test_Matrix<scalar_t>( false, true, m, nrhs, params ); + + auto& X = X_alloc.A; + auto& X2 = X2_alloc.A; + auto& Y = Y_alloc.A; + MatrixParams mp; mp.kind.set_default( "rand" ); generate_matrix( mp, X ); @@ -207,6 +179,9 @@ void test_trmm_work(Params& params, bool run) params.gflops() = gflop / time; if (check && ! ref) { + auto& X = X_alloc.A; + auto& Y = Y_alloc.A; + // SLATE residual check. // Check error, B*X - Y.
real_t y_norm = slate::norm( norm, Y, opts ); @@ -224,48 +199,27 @@ void test_trmm_work(Params& params, bool run) if (ref) { #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank == mpi_rank_ ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert(p == p_ && q == q_); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); + blas_int ictxt, A_desc[9], B_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); - if (origin != slate::Origin::ScaLAPACK) { - // Copy SLATE result back from GPU or CPU tiles. - copy( A, &A_data[0], A_desc ); - copy( B, &B_data[0], B_desc ); - } + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, B_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); - // allocate workspace for norms - std::vector<real_t> worklantr(std::max(mlocA, nlocA)); - std::vector<real_t> worklange(std::max(mlocB, nlocB)); + auto& A_data = A_alloc.A_data; + auto& B_data = B_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; - // get norms of the original data - real_t A_norm = scalapack_plantr( - norm2str(norm), uplo2str(uplo), diag2str(diag), Am, An, &A_data[0], - 1, 1, A_desc, &worklantr[0]); - real_t B_orig_norm = scalapack_plange( - norm2str(norm), Bm, Bn, &Bref_data[0], 1, 1, B_desc, &worklange[0]); + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + B_data.resize( B_alloc.lld * B_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + copy(B, &B_data[0], B_desc); + } //================================================== // Run ScaLAPACK reference routine.
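The *_alloc objects threaded through these hunks come from the new test/matrix_utils helpers added by this patch; their definitions are not shown in these hunks. The sketch below reconstructs the interface from its uses in this diff — the member and method names appear verbatim above, while the exact types and layout are assumptions:

    // Assumed shape of the TestMatrix helper (see test/matrix_utils.hh/.cc).
    template <typename MatrixType>
    struct TestMatrix {
        // Assumes SLATE matrices expose value_type, as slate::BaseMatrix does.
        using scalar_t = typename MatrixType::value_type;

        MatrixType A;     // matrix under test (SLATE- or ScaLAPACK-origin)
        MatrixType Aref;  // reference copy for check/ref runs
        std::vector<scalar_t> A_data, Aref_data;  // 2D block-cyclic buffers backing A, Aref
        int64_t mloc, nloc, lld;  // local rows, local cols, local leading dimension

        // Replaces the deleted Cblacs_pinfo/Cblacs_get/Cblacs_gridinit and
        // scalapack_descinit boilerplate with one call each.
        void create_ScaLAPACK_context( blas_int* ictxt );
        void ScaLAPACK_descriptor( blas_int ictxt, blas_int desc[9] );
    };

Centralizing the grid sanity checks and descinit error handling in these two methods is what lets each reference block above shrink to a handful of calls.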
@@ -277,12 +231,11 @@ void test_trmm_work(Params& params, bool run) &Bref_data[0], 1, 1, Bref_desc); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - // Local operation: error = Bref_data - B_data - blas::axpy(Bref_data.size(), -1.0, &B_data[0], 1, &Bref_data[0], 1); + // get differences B = B - Bref + slate::add(-one, Bref, one, B); - // norm(Bref_data - B_data) - real_t B_diff_norm = scalapack_plange(norm2str(norm), Bm, Bn, &Bref_data[0], - 1, 1, Bref_desc, &worklange[0]); + // norm(B - Bref) + real_t B_diff_norm = slate::norm(norm, B); real_t error = B_diff_norm / (sqrt(real_t(Am) + 2) * std::abs(alpha) * A_norm * B_orig_norm); @@ -298,7 +251,7 @@ void test_trmm_work(Params& params, bool run) Cblacs_gridexit(ictxt); //Cblacs_exit(1) does not handle re-entering #else // not SLATE_HAVE_SCALAPACK - if (mpi_rank == 0) + if (A.mpiRank() == 0) printf( "ScaLAPACK not available\n" ); #endif } diff --git a/test/test_trnorm.cc b/test/test_trnorm.cc index 8262e6ac9..bd8d87f67 100644 --- a/test/test_trnorm.cc +++ b/test/test_trnorm.cc @@ -7,10 +7,11 @@ #include "test.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" -#include "grid_utils.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include #include @@ -38,12 +39,17 @@ void test_trnorm_work(Params& params, bool run) bool check = params.check() == 'y'; bool ref = params.ref() == 'y'; bool trace = params.trace() == 'y'; + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + bool ref_copy = nonuniform_nb && (check || ref); int verbose = params.verbose(); int extended = params.extended(); slate::Origin origin = params.origin(); slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); params.matrix.mark(); + mark_params_for_test_TrapezoidMatrix( params ); + // mark non-standard output values params.time(); params.ref_time(); @@ -51,55 +57,26 @@ void test_trnorm_work(Params& params, bool run) if (! run) return; + // Check for common invalid combinations + if (is_invalid_parameters( params, true )) { + return; + } + slate::Options const opts = { {slate::Option::Target, target} }; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // upper requires m <= n, - // lower requires m >= n - if ((uplo == slate::Uplo::Lower && m < n) || - (uplo == slate::Uplo::Upper && m > n)) { - char buf[255]; - snprintf( buf, sizeof( buf ), "skipping invalid size: %s, %lld-by-%lld", - uplo2str( uplo ), llong( m ), llong( n ) ); - params.msg() = buf; - return; - } + auto A_alloc = allocate_test_TrapezoidMatrix<scalar_t>( ref_copy, false, m, n, params ); - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(m, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + auto& A = A_alloc.A; + auto& Aref = A_alloc.Aref; - // Allocate ScaLAPACK data if needed. - std::vector<scalar_t> A_data; - if (origin == slate::Origin::ScaLAPACK || check || ref || extended ) { - A_data.resize( lldA * nlocA ); - } + slate::generate_matrix(params.matrix, A); - // todo: work-around to initialize BaseMatrix::num_devices_ - slate::TrapezoidMatrix<scalar_t> A0(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); - - slate::TrapezoidMatrix<scalar_t> A; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles.
- slate::Target origin_target = origin2target(origin); - A = slate::TrapezoidMatrix<scalar_t>(uplo, diag, m, n, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - } - else { - // Create SLATE matrix from the ScaLAPACK layout. - A = slate::TrapezoidMatrix<scalar_t>::fromScaLAPACK( - uplo, diag, m, n, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); + if (ref_copy) { + copy_matrix( A, Aref ); } - slate::generate_matrix( params.matrix, A ); - print_matrix("A", A, params); if (trace) @@ -123,39 +100,27 @@ void test_trnorm_work(Params& params, bool run) // compute and save timing/performance params.time() = time; - #ifdef SLATE_HAVE_SCALAPACK - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, m, n, nb, nb, 0, 0, ictxt, lldA, &info); - if (info != 0) - printf( "scalapack_descinit info %lld\n", llong( info ) ); - slate_assert(info == 0); - - if (origin != slate::Origin::ScaLAPACK && (check || ref || extended)) { - copy( A, &A_data[0], A_desc ); - } - - if (check || ref) { + if (check || ref) { + #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK + // initialize BLACS and ScaLAPACK + blas_int ictxt, A_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + + auto& A_data = ref_copy ? A_alloc.Aref_data : A_alloc.A_data; + + if (origin != slate::Origin::ScaLAPACK && !ref_copy) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + copy(A, &A_data[0], A_desc); + } + // allocate work space - std::vector<real_t> worklantr(std::max(mlocA, nlocA)); + std::vector<real_t> worklantr( std::max(A_alloc.mloc, A_alloc.nloc) ); + + // comparison with reference routine from ScaLAPACK //================================================== // Run ScaLAPACK reference routine. @@ -166,10 +131,6 @@ void test_trnorm_work(Params& params, bool run) m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); time = barrier_get_wtime(MPI_COMM_WORLD) - time; - //A_norm_ref = lapack::lantr( - // norm, A.uplo(), diag, - // m, n, &A_data[0], lldA); - // difference between norms real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; if (norm == slate::Norm::One) { error /= sqrt(m); } else if (norm == slate::Norm::Inf) { error /= sqrt(n); } else if (norm == slate::Norm::Fro) { error /= sqrt(m*n); } - if (verbose && mpi_rank == 0) { + if (verbose && A.mpiRank() == 0) { printf("norm %15.8e, ref %15.8e, ref - norm %5.2f, error %9.2e\n", A_norm, A_norm_ref, A_norm_ref - A_norm, error); } @@ -200,147 +161,141 @@ void test_trnorm_work(Params& params, bool run) // Allow for difference params.okay() = (params.error() <= tol); - } - //---------- extended tests - if (extended) { - // allocate work space - std::vector<real_t> worklantr(std::max(mlocA, nlocA)); - - // seed all MPI processes the same - srand(1234); - - // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, - // up to 64 tiles total. - // Indices may be out-of-bounds if mt or nt is small, so check in loops.
- int64_t mt = A.mt(); - int64_t nt = A.nt(); - std::set<int64_t> i_indices = { 0, 1, mt - 2, mt - 1 }; - std::set<int64_t> j_indices = { 0, 1, nt - 2, nt - 1 }; - for (size_t k = 0; k < 4; ++k) { - i_indices.insert(rand() % mt); - j_indices.insert(rand() % nt); - } - for (auto j : j_indices) { - if (j < 0 || j >= nt) - continue; - int64_t jb = std::min(n - j*nb, nb); - slate_assert(jb == A.tileNb(j)); - - for (auto i : i_indices) { - // lower requires i >= j - // upper requires i <= j - if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) - continue; - int64_t ib = std::min(m - i*nb, nb); - slate_assert(ib == A.tileMb(i)); - - // Test entries in 2x2 in all 4 corners, and 1 other random row and col, - // up to 25 entries per tile. - // Indices may be out-of-bounds if ib or jb is small, so check in loops. - std::set<int64_t> ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; - std::set<int64_t> jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; - - // todo: complex peak - scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; - if (rand() < RAND_MAX / 2) - peak *= -1; - if (rand() < RAND_MAX / 20) - peak = nan(""); - scalar_t save = 0; - - for (auto jj : jj_indices) { - if (jj < 0 || jj >= jb) + //---------- extended tests + if (extended) { + if (grid_order != slate::GridOrder::Col) { + printf("WARNING: cannot do extended tests with row-major grid\n"); + } + else { + // seed all MPI processes the same + srand(1234); + + // Test tiles in 2x2 in all 4 corners, and 4 random rows and cols, + // up to 64 tiles total. + // Indices may be out-of-bounds if mt or nt is small, so check in loops. + int64_t mt = A.mt(); + int64_t nt = A.nt(); + std::set<int64_t> i_indices = { 0, 1, mt - 2, mt - 1 }; + std::set<int64_t> j_indices = { 0, 1, nt - 2, nt - 1 }; + for (size_t k = 0; k < 4; ++k) { + i_indices.insert(rand() % mt); + j_indices.insert(rand() % nt); + } + for (auto j : j_indices) { + if (j < 0 || j >= nt) continue; + int64_t jb = std::min(n - j*nb, nb); + slate_assert(jb == A.tileNb(j)); - for (auto ii : ii_indices) { - if (ii < 0 || ii >= ib - || (i == j && (uplo == slate::Uplo::Lower - ? ii < jj - : ii > jj))) { + for (auto i : i_indices) { + // lower requires i >= j + // upper requires i <= j + if (i < 0 || i >= mt || (uplo == slate::Uplo::Lower ? i < j : i > j)) continue; - } - - int64_t ilocal = int(i / p)*nb + ii; - int64_t jlocal = int(j / q)*nb + jj; - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - save = T(ii, jj); - slate_assert(A_data[ ilocal + jlocal*lldA ] == save); - T.at(ii, jj) = peak; - A_data[ ilocal + jlocal*lldA ] = peak; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } - - A_norm = slate::norm(norm, A, opts); - - real_t A_norm_ref = scalapack_plantr( - norm2str(norm), uplo2str(A.uplo()), diag2str(diag), - m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); - - // difference between norms - real_t error = std::abs(A_norm - A_norm_ref) / A_norm_ref; - if (norm == slate::Norm::One) { - error /= sqrt(m); - } - else if (norm == slate::Norm::Inf) { - error /= sqrt(n); - } - else if (norm == slate::Norm::Fro) { - error /= sqrt(m*n); - } - - // Allow for difference, except max norm in real should be exact. - real_t eps = std::numeric_limits<real_t>::epsilon(); - real_t tol; - if (norm == slate::Norm::Max && !
slate::is_complex<scalar_t>::value) - tol = 0; - else - tol = 10*eps; - - if (mpi_rank == 0) { - // if peak is nan, expect A_norm to be nan, - // except in Unit case with i == j and ii == jj, - // where peak shouldn't affect A_norm. - bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) - ? std::isnan(A_norm) - : error <= tol); - params.okay() = params.okay() && okay; - if (verbose || ! okay) { - printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", - llong( i ), llong( j ), llong( ii ), llong( jj ), - real(peak), A_norm, A_norm_ref, error, - (okay ? "pass" : "failed")); + int64_t ib = std::min(m - i*nb, nb); + slate_assert(ib == A.tileMb(i)); + + // Test entries in 2x2 in all 4 corners, and 1 other random row and col, + // up to 25 entries per tile. + // Indices may be out-of-bounds if ib or jb is small, so check in loops. + std::set<int64_t> ii_indices = { 0, 1, ib - 2, ib - 1, rand() % ib }; + std::set<int64_t> jj_indices = { 0, 1, jb - 2, jb - 1, rand() % jb }; + + // todo: complex peak + scalar_t peak = rand() / double(RAND_MAX)*1e6 + 1e6; + if (rand() < RAND_MAX / 2) + peak *= -1; + if (rand() < RAND_MAX / 20) + peak = nan(""); + scalar_t save = 0; + + for (auto jj : jj_indices) { + if (jj < 0 || jj >= jb) + continue; + + for (auto ii : ii_indices) { + if (ii < 0 || ii >= ib + || (i == j && (uplo == slate::Uplo::Lower + ? ii < jj + : ii > jj))) { + continue; + } + + int64_t ilocal = int(i / p)*nb + ii; + int64_t jlocal = int(j / q)*nb + jj; + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + save = T(ii, jj); + slate_assert(A_data[ ilocal + jlocal*A_alloc.lld ] == save); + T.at(ii, jj) = peak; + A_data[ ilocal + jlocal*A_alloc.lld ] = peak; + // todo: this move shouldn't be required -- the trnorm should copy data itself. + A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } + + A_norm = slate::norm(norm, A, opts); + + A_norm_ref = scalapack_plantr( + norm2str(norm), uplo2str(A.uplo()), diag2str(diag), + m, n, &A_data[0], 1, 1, A_desc, &worklantr[0]); + + // difference between norms + error = std::abs(A_norm - A_norm_ref) / A_norm_ref; + if (norm == slate::Norm::One) { + error /= sqrt(m); + } + else if (norm == slate::Norm::Inf) { + error /= sqrt(n); + } + else if (norm == slate::Norm::Fro) { + error /= sqrt(m*n); + } + + if (A.mpiRank() == 0) { + // if peak is nan, expect A_norm to be nan, + // except in Unit case with i == j and ii == jj, + // where peak shouldn't affect A_norm. + bool okay = (std::isnan(real(peak)) && ! (diag == slate::Diag::Unit && i == j && ii == jj) + ? std::isnan(A_norm) + : error <= tol); + params.okay() = params.okay() && okay; + if (verbose || ! okay) { + printf("i %5lld, j %5lld, ii %3lld, jj %3lld, peak %15.8e, norm %15.8e, ref %15.8e, error %9.2e, %s\n", + llong( i ), llong( j ), llong( ii ), llong( jj ), + real(peak), A_norm, A_norm_ref, error, + (okay ? "pass" : "failed")); + } + } + + if (A.tileIsLocal(i, j)) { + A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); + auto T = A(i, j); + T.at(ii, jj) = save; + A_data[ ilocal + jlocal*A_alloc.lld ] = save; + // todo: this move shouldn't be required -- the trnorm should copy data itself.
+ A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); + } } } - - if (A.tileIsLocal(i, j)) { - A.tileGetForWriting(i, j, slate::LayoutConvert::ColMajor); - auto T = A(i, j); - T.at(ii, jj) = save; - A_data[ ilocal + jlocal*lldA ] = save; - // todo: this move shouldn't be required -- the trnorm should copy data itself. - A.tileGetForWriting(i, j, A.tileDevice(i, j), slate::LayoutConvert::ColMajor); - } } } } } - } - - Cblacs_gridexit(ictxt); - //Cblacs_exit(1) does not handle re-entering - #else // not SLATE_HAVE_SCALAPACK - SLATE_UNUSED( A_norm ); - SLATE_UNUSED( check ); - SLATE_UNUSED( ref ); - SLATE_UNUSED( extended ); - SLATE_UNUSED( verbose ); - if (mpi_rank == 0) - printf( "ScaLAPACK not available\n" ); - #endif + + Cblacs_gridexit(ictxt); + //Cblacs_exit(1) does not handle re-entering + #else // not SLATE_HAVE_SCALAPACK + SLATE_UNUSED( A_norm ); + SLATE_UNUSED( check ); + SLATE_UNUSED( ref ); + SLATE_UNUSED( extended ); + SLATE_UNUSED( verbose ); + if (A.mpiRank() == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } } // ----------------------------------------------------------------------------- diff --git a/test/test_trsm.cc b/test/test_trsm.cc index 24e3d3360..d1b8affe4 100644 --- a/test/test_trsm.cc +++ b/test/test_trsm.cc @@ -6,12 +6,13 @@ #include "slate/slate.hh" #include "test.hh" #include "blas/flops.hh" +#include "print_matrix.hh" + +#include "matrix_utils.hh" +#include "test_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" -#include "print_matrix.hh" -#include "grid_utils.hh" #include #include @@ -44,9 +45,6 @@ void test_trsm_work(Params& params, bool run) int64_t m = params.dim.m(); int64_t n = params.dim.n(); scalar_t alpha = params.alpha.get<scalar_t>(); - int p = params.grid.m(); - int q = params.grid.n(); - int64_t nb = params.nb(); int64_t lookahead = params.lookahead(); slate::Norm norm = params.norm(); bool check = params.check() == 'y'; @@ -58,6 +56,9 @@ void test_trsm_work(Params& params, bool run) params.matrix.mark(); params.matrixB.mark(); + mark_params_for_test_TriangularMatrix( params ); + mark_params_for_test_Matrix( params ); + // mark non-standard output values params.time(); params.gflops(); @@ -72,6 +73,11 @@ void test_trsm_work(Params& params, bool run) return; } + // Check for common invalid combinations + if (is_invalid_parameters( params )) { + return; + } + slate::Options const opts = { {slate::Option::Lookahead, lookahead}, {slate::Option::Target, target}, @@ -87,49 +93,12 @@ void test_trsm_work(Params& params, bool run) int64_t Bm = m; int64_t Bn = n; - // MPI variables - int mpi_rank, myrow, mycol; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - gridinfo(mpi_rank, p, q, &myrow, &mycol); - - // Matrix A: figure out local size. - int64_t mlocA = num_local_rows_cols(Am, nb, myrow, p); - int64_t nlocA = num_local_rows_cols(An, nb, mycol, q); - int64_t lldA = blas::max(1, mlocA); // local leading dimension of A - - // Matrix B: figure out local size. - int64_t mlocB = num_local_rows_cols(Bm, nb, myrow, p); - int64_t nlocB = num_local_rows_cols(Bn, nb, mycol, q); - int64_t lldB = blas::max(1, mlocB); // local leading dimension of B - - // Allocate ScaLAPACK data if needed.
- std::vector<scalar_t> A_data, B_data; - if (ref || origin == slate::Origin::ScaLAPACK) { - A_data.resize( lldA * nlocA ); - } + auto A_alloc = allocate_test_TriangularMatrix<scalar_t>( false, true, An, params ); + auto B_alloc = allocate_test_Matrix<scalar_t>( check || ref, true, Bm, Bn, params ); - slate::TriangularMatrix<scalar_t> A; - slate::Matrix<scalar_t> B; - if (origin != slate::Origin::ScaLAPACK) { - // SLATE allocates CPU or GPU tiles. - slate::Target origin_target = origin2target(origin); - A = slate::TriangularMatrix<scalar_t>( - uplo, diag, An, nb, p, q, MPI_COMM_WORLD); - A.insertLocalTiles(origin_target); - - B = slate::Matrix<scalar_t>( - Bm, Bn, nb, p, q, MPI_COMM_WORLD); - B.insertLocalTiles(origin_target); - } - else { - // create SLATE matrices from the ScaLAPACK layouts - A = slate::TriangularMatrix<scalar_t>::fromScaLAPACK( - uplo, diag, An, &A_data[0], lldA, nb, p, q, MPI_COMM_WORLD); - - B_data.resize( lldB * nlocB ); - B = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &B_data[0], lldB, nb, p, q, MPI_COMM_WORLD); - } + auto& A = A_alloc.A; + auto& B = B_alloc.A; + auto& Bref = B_alloc.Aref; slate::generate_matrix( params.matrix, A ); slate::generate_matrix( params.matrixB, B ); @@ -140,13 +109,8 @@ void test_trsm_work(Params& params, bool run) auto AH = slate::HermitianMatrix<scalar_t>( A ); slate::potrf( AH, opts ); - // if check is required, copy test data - std::vector< scalar_t > Bref_data; - slate::Matrix<scalar_t> Bref; + // If check or ref run is required, copy the test data. if (check || ref) { - Bref_data.resize( lldB * nlocB ); - Bref = slate::Matrix<scalar_t>::fromScaLAPACK( - Bm, Bn, &Bref_data[0], lldB, nb, p, q, MPI_COMM_WORLD); slate::copy( B, Bref ); } @@ -220,34 +184,22 @@ void test_trsm_work(Params& params, bool run) #ifdef SLATE_HAVE_SCALAPACK // comparison with reference routine from ScaLAPACK for timing only - // BLACS/MPI variables - blas_int ictxt, p_, q_, myrow_, mycol_; - blas_int A_desc[9], B_desc[9], Bref_desc[9]; - blas_int mpi_rank_ = 0, nprocs = 1; - // initialize BLACS and ScaLAPACK - Cblacs_pinfo(&mpi_rank_, &nprocs); - slate_assert( mpi_rank_ == mpi_rank ); - slate_assert(p*q <= nprocs); - Cblacs_get(-1, 0, &ictxt); - Cblacs_gridinit(&ictxt, "Col", p, q); - Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); - slate_assert( p == p_ ); - slate_assert( q == q_ ); - slate_assert( myrow == myrow_ ); - slate_assert( mycol == mycol_ ); - - int64_t info; - scalapack_descinit(A_desc, Am, An, nb, nb, 0, 0, ictxt, mlocA, &info); - slate_assert(info == 0); - - scalapack_descinit(B_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - scalapack_descinit(Bref_desc, Bm, Bn, nb, nb, 0, 0, ictxt, mlocB, &info); - slate_assert(info == 0); - - copy( A, &A_data[0], A_desc ); + blas_int ictxt, A_desc[9], Bref_desc[9]; + A_alloc.create_ScaLAPACK_context( &ictxt ); + + A_alloc.ScaLAPACK_descriptor( ictxt, A_desc ); + B_alloc.ScaLAPACK_descriptor( ictxt, Bref_desc ); + + auto& A_data = A_alloc.A_data; + auto& Bref_data = B_alloc.Aref_data; + + if (origin != slate::Origin::ScaLAPACK) { + A_data.resize( A_alloc.lld * A_alloc.nloc ); + + // Copy SLATE matrix into ScaLAPACK matrix + copy(A, &A_data[0], A_desc); + } //================================================== // Run ScaLAPACK reference routine.
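Taken together, the trmm, trnorm, and trsm hunks apply one adoption recipe per test: mark the helper's parameters, skip unsupported combinations early, allocate through the helper, and take references into it. A condensed sketch of that recipe, with a hypothetical test_xyz_work standing in for any of the tests above; the boolean flag meanings are inferred from the call sites, not documented in this diff:

    template <typename scalar_t>
    void test_xyz_work( Params& params, bool run )
    {
        int64_t m = params.dim.m();
        int64_t n = params.dim.n();

        params.matrix.mark();
        mark_params_for_test_Matrix( params );  // registers nb, grid, dev_order, nonuniform_nb

        if (! run)
            return;

        // Bail out early on combinations the helpers cannot support.
        if (is_invalid_parameters( params ))
            return;

        // First flag appears to request a reference copy (Aref); the second is assumed.
        auto B_alloc = allocate_test_Matrix<scalar_t>( true, true, m, n, params );
        auto& B    = B_alloc.A;
        auto& Bref = B_alloc.Aref;

        slate::generate_matrix( params.matrix, B );
        slate::copy( B, Bref );  // keep the original for the residual check
    }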
diff --git a/test/test_trtri.cc b/test/test_trtri.cc index 2cd7f6daf..e0c0d9435 100644 --- a/test/test_trtri.cc +++ b/test/test_trtri.cc @@ -9,7 +9,6 @@ #include "lapack/flops.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include "print_matrix.hh" #include "grid_utils.hh" diff --git a/test/test_unmqr.cc b/test/test_unmqr.cc index d7270b303..87eff3ecf 100644 --- a/test/test_unmqr.cc +++ b/test/test_unmqr.cc @@ -11,7 +11,6 @@ #include "grid_utils.hh" #include "scalapack_wrappers.hh" -#include "scalapack_support_routines.hh" #include "scalapack_copy.hh" #include diff --git a/test/test_utils.hh b/test/test_utils.hh index a946411b0..bdae1d494 100644 --- a/test/test_utils.hh +++ b/test/test_utils.hh @@ -7,8 +7,59 @@ #define SLATE_TEST_UTILS_HH #include "slate/slate.hh" +#include "test.hh" -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ +/// Checks for common invalid parameter combinations +/// +/// @return true if the configuration should be skipped +/// +inline bool is_invalid_parameters(Params& params, bool keep_nonuniform_ref = false) +{ + slate::Origin origin = params.origin(); + slate::Target target = params.target(); + slate::GridOrder dev_order = params.dev_order(); + bool nonuniform_nb = params.nonuniform_nb() == 'y'; + + if (target != slate::Target::Devices && dev_order == slate::GridOrder::Col) { + params.msg() = "skipping: dev_order = Col applies only to target devices"; + return true; + } + + if (dev_order == slate::GridOrder::Col && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: dev_order = Col not supported with origin=ScaLAPACK"; + return true; + } + + if (nonuniform_nb && origin == slate::Origin::ScaLAPACK) { + params.msg() = "skipping: nonuniform tile not supported with origin=ScaLAPACK"; + return true; + } + + #ifdef SLATE_HAVE_SCALAPACK + if (!keep_nonuniform_ref && nonuniform_nb && params.ref() != 'n') { + params.msg() = "skipping reference: nonuniform tile not supported with ScaLAPACK"; + if (params.ref() == 'o') { + // If ref=='o', the user doesn't want to run SLATE version + return true; + } + else { + params.ref() = 'n'; + } + } + #else + // Can only run ref when we have ScaLAPACK + if (params.ref()) { + params.msg() = "skipping reference: ScaLAPACK not available"; + params.ref() = false; + } + #endif + + + return false; +} + +//------------------------------------------------------------------------------ /// Applies the operator thunk to each element of A and B to update B. /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the @@ -29,19 +80,25 @@ void matrix_iterator( int64_t B_mt = B.mt(); int64_t B_nt = B.nt(); - if constexpr (std::is_same<matrix_type, slate::Matrix<scalar_t>>::value) { - int64_t A_j = 0, A_jj = 0; - for (int64_t B_j = 0; B_j < B_nt; ++B_j) { + constexpr bool is_general = std::is_same_v<matrix_type, slate::Matrix<scalar_t>>; + assert( is_general == (A.uplo() == slate::Uplo::General) ); + bool is_upper = (A.uplo() == slate::Uplo::Upper); + + int64_t A_j = 0, A_jj = 0; + for (int64_t B_j = 0; B_j < B_nt; ++B_j) { - int64_t A_i = 0, A_ii = 0; - for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + int64_t A_i = 0, A_ii = 0; + for (int64_t B_i = 0; B_i < B_mt; ++B_i) { + + if (is_general || (is_upper ?
(B_i <= B_j) : (B_i >= B_j))) { #pragma omp task shared(A, B) \ firstprivate( B_i, B_j, A_i, A_j, A_ii, A_jj ) { + int tag = A_i + A_j * A.mt(); if (B.tileIsLocal( B_i, B_j )) { A.tileRecv( A_i, A_j, A.tileRank( A_i, A_j ), - slate::Layout::ColMajor ); + slate::Layout::ColMajor, tag ); A.tileGetForReading( A_i, A_j, ColMajor ); B.tileGetForWriting( B_i, B_j, ColMajor ); @@ -56,40 +113,44 @@ void matrix_iterator( scalar_t const* TA_data = TA.data(); scalar_t* TB_data = TB.data(); for (int64_t jj = 0; jj < nb; ++jj) { - for (int64_t ii = 0; ii < mb; ++ii) { + int64_t ii_start = 0, ii_end = mb; + if (!is_general && B_i == B_j) { // diagonal tile + if (is_upper) + ii_end = std::min(jj+1, mb); + else + ii_start = jj; + } + for (int64_t ii = ii_start; ii < ii_end; ++ii) { thunk( TA_data[ (A_ii+ii) + (A_jj+jj)*lda ], TB_data[ ii + jj*ldb ] ); } } } else if (A.tileIsLocal( A_i, A_j )) { - A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ) ); + A.tileSend( A_i, A_j, B.tileRank( B_i, B_j ), tag ); } } - A_ii += B.tileMb( B_i ); - assert( A_ii <= A.tileMb( A_i ) ); - if (A_ii == A.tileMb( A_i )) { - ++A_i; - A_ii = 0; - } } - A_jj += B.tileNb( B_j ); - assert( A_jj <= A.tileNb( A_j ) ); - if (A_jj == A.tileNb( A_j )) { - ++A_j; - A_jj = 0; + A_ii += B.tileMb( B_i ); + assert( A_ii <= A.tileMb( A_i ) ); + if (A_ii == A.tileMb( A_i )) { + ++A_i; + A_ii = 0; } } - } - else { - assert( false ); + A_jj += B.tileNb( B_j ); + assert( A_jj <= A.tileNb( A_j ) ); + if (A_jj == A.tileNb( A_j )) { + ++A_j; + A_jj = 0; + } } A.releaseRemoteWorkspace(); } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// subtract_matrices takes input matrices A and B, and performs B = B - A. /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the @@ -103,7 +164,7 @@ void subtract_matrices( matrix_type& A, matrix_type& B ) matrix_iterator( A, B, [](const scalar_t& a, scalar_t& b) { b -= a; } ); } -///----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ /// copy_matrix copies A to B /// The matrices must have the same size, but can have different tile sizes and /// distributions. However, the elements of a tile of B must all belong to the