Skip to content

Commit

Permalink
Merge pull request #147 from neil-lindquist/blas-tile-life
Browse files Browse the repository at this point in the history
Remove tile life from internal routines and BLAS routines
  • Loading branch information
neil-lindquist authored Dec 1, 2023
2 parents 0e9cac3 + abaf8c7 commit 68ab7f4
Show file tree
Hide file tree
Showing 13 changed files with 220 additions and 99 deletions.
56 changes: 27 additions & 29 deletions src/gemmA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,20 +131,19 @@ void gemmA(
int tag_0 = 0;
C.template listReduce( reduce_list_C, layout, tag_0 );
}
// Clean the memory introduced by internal::gemmA on Devices
if (target == Target::Devices) {
#pragma omp task depend( in:gemmA[ 0 ] ) \
shared( B, C )
{
auto B_col_0 = B.sub( 0, B.mt()-1, 0, 0 );
B_col_0.releaseRemoteWorkspace();
B_col_0.releaseLocalWorkspace();

auto C_col_0 = C.sub( 0, C.mt()-1, 0, 0 );
C_col_0.releaseRemoteWorkspace();
C_col_0.tileUpdateAllOrigin();
C_col_0.releaseLocalWorkspace();
}

// Clean up workspace
#pragma omp task depend( in:gemmA[ 0 ] ) \
shared( B, C )
{
auto B_col_0 = B.sub( 0, B.mt()-1, 0, 0 );
B_col_0.releaseRemoteWorkspace();
B_col_0.releaseLocalWorkspace();

auto C_col_0 = C.sub( 0, C.mt()-1, 0, 0 );
C_col_0.releaseRemoteWorkspace();
C_col_0.tileUpdateAllOrigin();
C_col_0.releaseLocalWorkspace();
}

// broadcast (with lookahead) and multiply the rest of the columns
Expand Down Expand Up @@ -194,21 +193,20 @@ void gemmA(
int tag_k = k;
C.template listReduce( reduce_list_C, layout, tag_k );
}
// Clean the memory introduced by internal::gemmA on Devices
if (target == Target::Devices) {
#pragma omp task depend( in:gemmA[ k ] ) \
shared( B, C ) \
firstprivate( k )
{
auto B_col_k = B.sub( 0, B.mt()-1, k, k );
B_col_k.releaseRemoteWorkspace();
B_col_k.releaseLocalWorkspace();

auto C_col_k = C.sub( 0, C.mt()-1, k, k );
C_col_k.releaseRemoteWorkspace();
C_col_k.tileUpdateAllOrigin();
C_col_k.releaseLocalWorkspace();
}

// Clean up workspace
#pragma omp task depend( in:gemmA[ k ] ) \
shared( B, C ) \
firstprivate( k )
{
auto B_col_k = B.sub( 0, B.mt()-1, k, k );
B_col_k.releaseRemoteWorkspace();
B_col_k.releaseLocalWorkspace();

auto C_col_k = C.sub( 0, C.mt()-1, k, k );
C_col_k.releaseRemoteWorkspace();
C_col_k.tileUpdateAllOrigin();
C_col_k.releaseLocalWorkspace();
}
}
#pragma omp taskwait
Expand Down
2 changes: 2 additions & 0 deletions src/heev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ void heev(

// Copy diagonal and super-diagonal to vectors.
internal::copyhb2st( Aband, Lambda, E );

Aband.releaseRemoteWorkspace();
}

// 3. Tri-diagonal eigenvalue solver.
Expand Down
10 changes: 5 additions & 5 deletions src/hegst.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ void hegst(
uint8_t* column = column_vector.data();

if (target == Target::Devices) {
// The work::trsm (itype=1) and work::trmm (itype=2,3)
// routines use 2 queues (queue 0,1). All other
// internal::routines here use the default queue (queue 0).
// So 2 queues need to be allocated.
A.allocateBatchArrays(0, 2+lookahead); // (batch size, num_queues)
// The work::trsm (itype=1) routine uses 2 queues (queue 0,1).
// The work::trmm (itype=2,3) routine uses 1 queue (queue 0).
// All other internal::routines here use the default queue (queue 0).
int64_t num_queues = (itype == 1) ? 2 : 1;
A.allocateBatchArrays(0, num_queues+lookahead); // (batch size, num_queues)
A.reserveDeviceWorkspace();
}

Expand Down
63 changes: 53 additions & 10 deletions src/hemmA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,21 @@ void hemmA(
using BcastList = typename Matrix<scalar_t>::BcastList;

const scalar_t one = 1.0;
const int priority_0 = 0;
const int queue_0 = 0;

// Assumes column major
const Layout layout = Layout::ColMajor;

// Options
int64_t lookahead = get_option<int64_t>( opts, Option::Lookahead, 1 );

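// Force TileReleaseStrategy::Slate for this driver, regardless of opts.
// With that strategy the internal routines called here (internal::hemmA
// and internal::gemmA, which receive opts2) do not release any tiles;
// this routine releases the workspace tiles itself after each step.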
Options opts2 = opts;
opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;

// if on right, change to left by transposing A, B, C to get
// op(C) = op(A)*op(B)
if (side == Side::Right) {
Expand Down Expand Up @@ -190,17 +198,26 @@ void hemmA(
Side::Left,
alpha, A.sub(0, 0),
B.sub(0, 0, 0, B.nt()-1),
beta, C.sub(0, 0, 0, C.nt()-1));
beta, C.sub(0, 0, 0, C.nt()-1),
priority_0, opts2 );

if (A.mt()-1 > 0) {
internal::gemmA<target>(
alpha, A.sub(1, A.mt()-1, 0, 0),
B.sub(0, 0, 0, B.nt()-1),
beta, C.sub(1, C.mt()-1, 0, C.nt()-1),
layout);
layout, priority_0, queue_0, opts2 );
}
}

// Clean up workspace
#pragma omp task depend( in:gemm[ 0 ] ) shared( B )
{
auto B_col_0 = B.sub( 0, 0, 0, B.nt()-1 );
B_col_0.releaseRemoteWorkspace();
B_col_0.releaseLocalWorkspace();
}

// Main loop
for (int64_t k = 1; k < A.nt(); ++k) {

Expand Down Expand Up @@ -268,23 +285,31 @@ void hemmA(
alpha, conj_transpose( Arow_k ),
B.sub(k, k, 0, B.nt()-1),
one, C.sub(0, k-1, 0, C.nt()-1),
layout);
layout, priority_0, queue_0, opts2 );

internal::hemmA<Target::HostTask>(
Side::Left,
alpha, A.sub(k, k),
B.sub(k, k, 0, B.nt()-1),
one, C.sub(k, k, 0, C.nt()-1));
one, C.sub(k, k, 0, C.nt()-1),
priority_0, opts2 );

if (A.mt()-1 > k) {
internal::gemmA<target>(
alpha, A.sub(k+1, A.mt()-1, k, k),
B.sub(k, k, 0, B.nt()-1),
one, C.sub(k+1, C.mt()-1, 0, C.nt()-1),
layout);
layout, priority_0, queue_0, opts2 );
}
}

// Clean up workspace
#pragma omp task depend( in:gemm[ k ] ) shared( B )
{
auto B_col_k = B.sub( k, k, 0, B.nt()-1 );
B_col_k.releaseRemoteWorkspace();
B_col_k.releaseLocalWorkspace();
}
}

#pragma omp task depend(in:gemm[A.nt()-1])
Expand Down Expand Up @@ -423,18 +448,27 @@ void hemmA(
Side::Left,
alpha, A.sub(0, 0),
B.sub(0, 0, 0, B.nt()-1),
beta, C.sub(0, 0, 0, C.nt()-1));
beta, C.sub(0, 0, 0, C.nt()-1),
priority_0, opts2 );

if (A.mt()-1 > 0) {
auto Arow_k = A.sub(0, 0, 1, A.nt()-1);
internal::gemmA<target>(
alpha, conj_transpose( Arow_k ),
B.sub(0, 0, 0, B.nt()-1),
beta, C.sub(1, C.mt()-1, 0, C.nt()-1),
layout);
layout, priority_0, queue_0, opts2 );
}
}

// Clean up workspace
#pragma omp task depend( in:gemm[ 0 ] ) shared( B )
{
auto B_col_0 = B.sub( 0, 0, 0, B.nt()-1 );
B_col_0.releaseRemoteWorkspace();
B_col_0.releaseLocalWorkspace();
}

// Main loop
for (int64_t k = 1; k < A.nt(); ++k) {

Expand Down Expand Up @@ -500,23 +534,32 @@ void hemmA(
alpha, A.sub(0, k-1, k, k),
B.sub(k, k, 0, B.nt()-1),
one, C.sub(0, k-1, 0, C.nt()-1),
layout);
layout, priority_0, queue_0, opts2 );

internal::hemmA<Target::HostTask>(
Side::Left,
alpha, A.sub(k, k),
B.sub(k, k, 0, B.nt()-1),
one, C.sub(k, k, 0, C.nt()-1));
one, C.sub(k, k, 0, C.nt()-1),
priority_0, opts2 );

if (A.nt()-1 > k) {
auto Arow_k = A.sub(k, k, k+1, A.nt()-1);
internal::gemmA<target>(
alpha, conj_transpose( Arow_k ),
B.sub(k, k, 0, B.nt()-1),
one, C.sub(k+1, C.mt()-1, 0, C.nt()-1),
layout);
layout, priority_0, queue_0, opts2 );
}
}

// Clean up workspace
#pragma omp task depend( in:gemm[ k ] ) shared( B )
{
auto B_col_k = B.sub( k, k, 0, B.nt()-1 );
B_col_k.releaseRemoteWorkspace();
B_col_k.releaseLocalWorkspace();
}
}

#pragma omp task depend(in:gemm[A.nt()-1])
Expand Down
2 changes: 0 additions & 2 deletions src/internal/internal_copyhb2st.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ void copyhb2st(internal::TargetType<Target::HostTask>,
auto T = A(i-1, i);
E[E_index] = real( T(T.mb()-1, 0) );
E_index += 1;
A.tileTick(i-1, i);
}

// Copy main diagonal to D.
Expand All @@ -82,7 +81,6 @@ void copyhb2st(internal::TargetType<Target::HostTask>,
E[E_index + j] = real( T(j, j+1) );
}
E_index += len-1;
A.tileTick(i, i);
}
}

Expand Down
2 changes: 0 additions & 2 deletions src/internal/internal_copytb2bd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ void copytb2bd(internal::TargetType<Target::HostTask>,
auto T = A(i-1, i);
E[E_index] = real( T(T.mb()-1, 0) );
E_index += 1;
A.tileTick(i-1, i);
}

// Copy main diagonal to D.
Expand All @@ -80,7 +79,6 @@ void copytb2bd(internal::TargetType<Target::HostTask>,
E[E_index + j] = real( T(j, j+1) );
}
E_index += len-1;
A.tileTick(i, i);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/internal/internal_getrf_tntpiv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ void getrf_tntpiv_panel(
aux_pivot[ 0 ], diag_len, A.mt(), mb );
}

Awork.tileTick( i2, 0 );
Awork.tileRelease( i2, 0 );
}
}
else {
Expand Down
8 changes: 2 additions & 6 deletions src/internal/internal_unmtr_hb2st.cc
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,6 @@ void unmtr_hb2st( internal::TargetType<target>,
}
}
}
V.tileTick(0, r);
} // if C(i, k) is local
} // inner for loop

Expand All @@ -451,11 +450,8 @@ void unmtr_hb2st( internal::TargetType<target>,
Vr_data[ii + ii*ldv] = tau[ii];
}
}
if (target == Target::Devices) {
for (int d = 0; d < C.num_devices(); ++d) {
V_.tileRelease(0, r, d);
}
}
V.releaseLocalWorkspaceTile(0, r);
V.releaseRemoteWorkspaceTile(0, r);
}
}
} // inner loop
Expand Down
2 changes: 2 additions & 0 deletions src/svd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ void svd(

// Copy diagonal and super-diagonal to vectors.
internal::copytb2bd(Aband, Sigma, E);

Aband.releaseRemoteWorkspace();
}

int64_t ncvt = 0, nru = 0, ldvt = 1, ldu = 1;
Expand Down
Loading

0 comments on commit 68ab7f4

Please sign in to comment.