Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nonuniform tile sizes, device distributions, and grid orders to even more testers (depends on #143) #157

Merged
merged 33 commits
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3d6965e
Add nonuniform nb support to test_posv
neil-lindquist Dec 15, 2023
ac36b67
Refactor out some duplicate code for TestMatrix class
neil-lindquist Dec 15, 2023
72f871d
Add support for dev_dist to test_gesv
neil-lindquist Dec 15, 2023
9dfcbd0
Refactor out some common error checks in the tester
neil-lindquist Dec 15, 2023
8460bb8
Allow TestMatrix to create ScaLAPACK contexts to reduce duplication
neil-lindquist Dec 15, 2023
aa06be3
Improve thread safety of matrix_iterator
neil-lindquist Dec 15, 2023
2681a86
Change default device distribution value to match behavior
neil-lindquist Dec 15, 2023
8758c28
Add grid order to test/run_tests.py
neil-lindquist Dec 18, 2023
fa4a5c3
Add options to hemm and symm, plus fix some other minor issues
neil-lindquist Dec 18, 2023
0827271
Refactor out the handling of some reference warning messages
neil-lindquist Dec 18, 2023
08d43b5
Add options to sy/herk and sy/her2k
neil-lindquist Dec 18, 2023
799cd98
Consolidate some if statements
neil-lindquist Dec 18, 2023
8b00be4
Add options to trmm and trsm
neil-lindquist Dec 19, 2023
782c482
Move allocate_test_* functions to cc file
neil-lindquist Dec 19, 2023
cca1384
Deduplicate code in matrix allocate
neil-lindquist Dec 19, 2023
3783b19
Tighten some function signatures
neil-lindquist Dec 19, 2023
88b497e
Refactor some test/run_tests.py parameters into variables
neil-lindquist Dec 19, 2023
caffee3
FIXUP 9c933fcf5
neil-lindquist Dec 27, 2023
60421d0
Add options to norm testers
neil-lindquist Dec 20, 2023
6b7636b
Fix bug in henorm and synorm for non-uniform tile sizes
neil-lindquist Dec 21, 2023
5be01cf
Reduce code duplication in matrix_iterator
neil-lindquist Dec 21, 2023
31d84c3
Improve some warning messages
neil-lindquist Dec 26, 2023
8e00a46
Delete unused header
neil-lindquist Dec 27, 2023
faddc11
Clean up some header includes
neil-lindquist Dec 27, 2023
3d3d152
Make norm testers more consistent
neil-lindquist Dec 27, 2023
778a866
Only do parameter check when doing actual test run
neil-lindquist Dec 27, 2023
ce53591
style
mgates3 Jan 23, 2024
3bc5090
test: add trans back to getrs_tntpiv, nopiv
mgates3 Jan 23, 2024
8d65640
factor nb out of {sy,he}norm loops; make {sy,he}norms more consistent
mgates3 Jan 24, 2024
6004184
remove unused vals_dev_arrays
mgates3 Jan 31, 2024
971dc35
use B_alloc.lld; wrap
mgates3 Jan 31, 2024
564f457
use params.matrixC
mgates3 Jan 31, 2024
d1eca32
test: replace DevDist with GridOrder
mgates3 Jan 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,7 @@ endif
tester_src += \
test/matrix_generator.cc \
test/matrix_params.cc \
test/matrix_utils.cc \
test/random.cc \
test/test.cc \
test/test_add.cc \
Expand Down
33 changes: 16 additions & 17 deletions src/internal/internal_henorm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,14 @@ void norm(
// Sum tile results into local results.
// Summing up local contributions only.
std::fill_n(values, A.n(), 0.0);
int64_t nb0 = A.tileNb(0);
int64_t mb0 = A.tileMb(0);
// off-diagonal blocks

jj = 0;
for (int64_t j = 0; j < A.nt(); ++j) {
int64_t nb = A.tileNb( j );

// off-diagonal blocks
int64_t ii = 0;
for (int64_t i = 0; i < A.mt(); ++i) {
int64_t nb = A.tileNb(j);
int64_t mb = A.tileMb(i);
if (A.tileIsLocal(i, j) &&
( ( lower && i > j) ||
Expand All @@ -210,27 +212,26 @@ void norm(
// col sums
blas::axpy(
nb, 1.0,
&tiles_sums[A.n()*i + j*nb0 ], 1,
&values[j*nb0], 1);
&tiles_sums[A.n()*i + jj ], 1,
&values[jj], 1);
// row sums
blas::axpy(
mb, 1.0,
&tiles_sums[A.m()*j + i*nb0 ], 1,
&values[i*mb0], 1);
&tiles_sums[A.m()*j + ii ], 1,
&values[ii], 1);
}
ii += mb;
}
}

// diagonal blocks
for (int64_t j = 0; j < A.nt(); ++j) {
int64_t nb = A.tileNb(j);
// diagonal blocks
if (A.tileIsLocal(j, j) ) {
// col sums
blas::axpy(
nb, 1.0,
&tiles_sums[A.n()*j + j*nb0 ], 1,
&values[j*nb0], 1);
&tiles_sums[A.n()*j + jj ], 1,
&values[jj], 1);
}
jj += nb;
}
}
//---------
Expand Down Expand Up @@ -345,8 +346,6 @@ void norm(

std::vector<std::vector<real_t> > vals_host_arrays(A.num_devices());

std::vector<real_t*> vals_dev_arrays(A.num_devices());

// devices_values used for max and Frobenius norms.
std::vector<real_t> devices_values;

Expand Down Expand Up @@ -374,7 +373,7 @@ void norm(
for (int device = 0; device < A.num_devices(); ++device) {
#pragma omp task slate_omp_default_none \
shared( A, devices_values ) \
shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
shared( vals_host_arrays, ijrange ) \
firstprivate(device, layout, lower, queue_index, in_norm, ldv) \
priority(priority)
{
Expand Down
46 changes: 21 additions & 25 deletions src/internal/internal_synorm.cc
mgates3 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ void norm(
int64_t jj = 0;
#pragma omp taskgroup
for (int64_t j = 0; j < A.nt(); ++j) {
// diagonal tile
// diagonal tiles
if (j < A.mt() && A.tileIsLocal(j, j)) {
#pragma omp task slate_omp_default_none \
shared( A, tiles_sums ) \
Expand Down Expand Up @@ -190,17 +190,19 @@ void norm(
}
jj += A.tileNb(j);
}
//end omp taskgroup
// end omp taskgroup

// Sum tile results into local results.
// Summing up local contributions only.
std::fill_n(values, A.n(), 0.0);
int64_t nb0 = A.tileNb(0);
int64_t mb0 = A.tileMb(0);
// off-diagonal blocks

jj = 0;
for (int64_t j = 0; j < A.nt(); ++j) {
int64_t nb = A.tileNb( j );

// off-diagonal blocks
int64_t ii = 0;
for (int64_t i = 0; i < A.mt(); ++i) {
int64_t nb = A.tileNb(j);
int64_t mb = A.tileMb(i);
if (A.tileIsLocal(i, j) &&
( ( lower && i > j) ||
Expand All @@ -209,27 +211,26 @@ void norm(
// col sums
blas::axpy(
nb, 1.0,
&tiles_sums[A.n()*i + j*nb0 ], 1,
&values[j*nb0], 1);
&tiles_sums[A.n()*i + jj ], 1,
&values[jj], 1);
// row sums
blas::axpy(
mb, 1.0,
&tiles_sums[A.m()*j + i*nb0 ], 1,
&values[i*mb0], 1);
&tiles_sums[A.m()*j + ii ], 1,
&values[ii], 1);
}
ii += mb;
}
}

// diagonal blocks
for (int64_t j = 0; j < A.nt(); ++j) {
int64_t nb = A.tileNb(j);
// diagonal blocks
if (A.tileIsLocal(j, j) ) {
// col sums
blas::axpy(
nb, 1.0,
&tiles_sums[A.n()*j + j*nb0 ], 1,
&values[j*nb0], 1);
&tiles_sums[A.n()*j + jj ], 1,
&values[jj], 1);
}
jj += nb;
}
}
//---------
Expand Down Expand Up @@ -301,6 +302,7 @@ void norm(
}
}
}
// end omp taskgroup
}
}

Expand All @@ -325,7 +327,8 @@ void norm(
/// @ingroup norm_internal
///
template <typename scalar_t>
void norm(internal::TargetType<Target::Devices>,
void norm(
internal::TargetType<Target::Devices>,
Norm in_norm, NormScope scope, SymmetricMatrix<scalar_t>& A,
blas::real_type<scalar_t>* values,
int priority, int queue_index)
Expand All @@ -347,8 +350,6 @@ void norm(internal::TargetType<Target::Devices>,

std::vector<std::vector<real_t> > vals_host_arrays(A.num_devices());

std::vector<real_t*> vals_dev_arrays(A.num_devices());

// devices_values used for max and Frobenius norms.
std::vector<real_t> devices_values;

Expand Down Expand Up @@ -376,7 +377,7 @@ void norm(internal::TargetType<Target::Devices>,
for (int device = 0; device < A.num_devices(); ++device) {
#pragma omp task slate_omp_default_none \
shared( A, devices_values ) \
shared( vals_host_arrays, vals_dev_arrays, ijrange ) \
shared( vals_host_arrays, ijrange ) \
firstprivate(device, lower, queue_index, in_norm, ldv, layout) \
priority(priority)
{
Expand Down Expand Up @@ -493,11 +494,6 @@ void norm(internal::TargetType<Target::Devices>,
}
// end omp taskgroup

for (int device = 0; device < A.num_devices(); ++device) {
blas::Queue* queue = A.compute_queue(device, queue_index);
blas::device_free(vals_dev_arrays[device], *queue);
}

// Reduction over devices to local result.
if (in_norm == Norm::Max) {
*values = lapack::lange(in_norm,
Expand Down
4 changes: 1 addition & 3 deletions src/internal/internal_trnorm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,6 @@ void norm(

std::vector<std::vector<real_t> > vals_host_arrays(A.num_devices());

std::vector<real_t*> vals_dev_arrays(A.num_devices());

// devices_values used for max and Frobenius norms.
std::vector<real_t> devices_values;

Expand Down Expand Up @@ -399,7 +397,7 @@ void norm(
for (int device = 0; device < A.num_devices(); ++device) {
#pragma omp task slate_omp_default_none \
shared( A, devices_values ) \
shared( vals_host_arrays, vals_dev_arrays, irange, jrange ) \
shared( vals_host_arrays, irange, jrange ) \
firstprivate(device, queue_index, in_norm, ldv, layout) \
priority(priority)
{
Expand Down
Loading
Loading