Skip to content

Commit

Permalink
Merge branch 'use_lambdas_for_gpu_reduce' into 'master'
Browse files Browse the repository at this point in the history
Use lambdas for gpu reduce

See merge request npneq/inq!1154
  • Loading branch information
xavierandrade committed Oct 23, 2024
2 parents c962fea + 714271d commit a3b0217
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 134 deletions.
30 changes: 6 additions & 24 deletions external_libs/gpurun/include/gpu/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,24 +435,6 @@ gpu::array<Type, 1> run(long sizex, reduce const & redy, reduce const & redz, T
#include <mpi3/environment.hpp>
#include <catch2/catch_all.hpp>

struct ident {
GPU_FUNCTION auto operator()(long ii) const {
return double(ii);
}
};

struct prod {
GPU_FUNCTION auto operator()(long ix, long iy) const {
return double(ix)*double(iy);
}
};

struct prod3 {
GPU_FUNCTION auto operator()(long ix, long iy, long iz) const {
return double(ix)*double(iy)*double(iz);
}
};

TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) {

using namespace Catch::literals;
Expand All @@ -463,7 +445,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) {

int rank = 0;
for(long nn = 1; nn <= maxsize; nn *= 3){
CHECK(gpu::run(gpu::reduce(nn), -232.8, ident{}) == Approx(-232.8 + (nn*(nn - 1.0)/2.0)));
CHECK(gpu::run(gpu::reduce(nn), -232.8, [] GPU_LAMBDA (auto ii) { return double(ii);} ) == Approx(-232.8 + (nn*(nn - 1.0)/2.0)));
rank++;
}
}
Expand All @@ -476,7 +458,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) {
for(long nx = 1; nx <= maxsize; nx *= 5){
for(long ny = 1; ny <= maxsize; ny *= 5){

auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), 2.23, prod{});
auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), 2.23, [] GPU_LAMBDA (auto ix, auto iy) {return double(ix)*double(iy);});

CHECK(typeid(decltype(res)) == typeid(double));
CHECK(res == Approx(2.23 + nx*(nx - 1.0)/2.0*ny*(ny - 1.0)/2.0));
Expand All @@ -495,7 +477,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) {
for(long ny = 1; ny <= maxsize; ny *= 5){
for(long nz = 1; nz <= maxsize; nz *= 5){

auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), gpu::reduce(nz), 17.89, prod3{});
auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), gpu::reduce(nz), 17.89, [] GPU_LAMBDA (auto ix, auto iy, auto iz) {return double(ix)*double(iy)*double(iz);});

CHECK(typeid(decltype(res)) == typeid(double));
CHECK(res == Approx(17.89 + nx*(nx - 1.0)/2.0*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0));
Expand All @@ -518,7 +500,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) {

CHECK(typeid(decltype(res)) == typeid(gpu::array<double, 1>));
CHECK(res.size() == nx);
for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == -7.7 + double(ix)*ny*(ny - 1.0)/2.0);
for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == Approx(-7.7 + double(ix)*ny*(ny - 1.0)/2.0));
rank++;
}
}
Expand All @@ -534,12 +516,12 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) {
for(long ny = 1; ny <= maxsize; ny *= 5){
for(long nz = 1; nz <= maxsize; nz *= 5){

auto res = gpu::run(nx, gpu::reduce(ny), gpu::reduce(nz), 10.0, prod3{});
auto res = gpu::run(nx, gpu::reduce(ny), gpu::reduce(nz), 10.0, [] GPU_LAMBDA (auto ix, auto iy, auto iz) {return double(ix)*double(iy)*double(iz);});

CHECK(typeid(decltype(res)) == typeid(gpu::array<double, 1>));

CHECK(res.size() == nx);
for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == 10.0 + double(ix)*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0);
for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == Approx(10.0 + double(ix)*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0));
rank++;
}
}
Expand Down
15 changes: 3 additions & 12 deletions src/ground_state/calculator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,6 @@ class calculator {
public:
#endif

template <typename OccType, typename ArrayType>
struct state_conv_func {
OccType occ;
ArrayType arr;

GPU_FUNCTION double operator()(long ip) const {
return fabs(occ[ip]*arr[ip]);
}
};

template <typename NormResType>
static double state_convergence(systems::electrons & el, NormResType const & normres) {
CALI_CXX_MARK_FUNCTION;
Expand All @@ -87,8 +77,9 @@ class calculator {
for(int iphi = 0; iphi < el.kpin_size(); iphi++){
assert(el.occupations()[iphi].size() == normres[iphi].size());

auto func = state_conv_func<decltype(begin(el.occupations()[iphi])), decltype(begin(normres[iphi]))>{begin(el.occupations()[iphi]), begin(normres[iphi])};
state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0, func);
state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0, [occ = begin(el.occupations()[iphi]), arr = begin(normres[iphi])] GPU_LAMBDA (auto ip) {
return fabs(occ[ip]*arr[ip]);
});
}

el.kpin_states_comm().all_reduce_n(&state_conv, 1);
Expand Down
15 changes: 3 additions & 12 deletions src/hamiltonian/energy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,14 @@ namespace hamiltonian {
public:
#endif

template <typename OccType, typename ArrayType>
struct occ_sum_func {
OccType occ;
ArrayType arr;

GPU_FUNCTION double operator()(long ip) const {
return occ[ip]*real(arr[ip]);
}
};

template <typename OccType, typename ArrayType>
static double occ_sum(OccType const & occupations, ArrayType const & array) {
CALI_CXX_MARK_FUNCTION;

assert(occupations.size() == array.size());
auto func = occ_sum_func<decltype(begin(occupations)), decltype(begin(array))>{begin(occupations), begin(array)};
return gpu::run(gpu::reduce(array.size()), 0.0, func);
return gpu::run(gpu::reduce(array.size()), 0.0, [occ = begin(occupations), arr = begin(array)] GPU_LAMBDA (auto ip) {
return occ[ip]*real(arr[ip]);
});
}

public:
Expand Down
15 changes: 3 additions & 12 deletions src/hamiltonian/ks_hamiltonian.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,14 @@ class ks_hamiltonian {
public:
#endif

template <typename OccType, typename ArrayType>
struct occ_sum_func {
OccType occ;
ArrayType arr;

GPU_FUNCTION double operator()(long ip) const {
return occ[ip]*real(arr[ip]);
}
};

template <typename OccType, typename ArrayType>
static double occ_sum(OccType const & occupations, ArrayType const & array) {
CALI_CXX_MARK_FUNCTION;

assert(occupations.size() == array.size());
auto func = occ_sum_func<decltype(begin(occupations)), decltype(begin(array))>{begin(occupations), begin(array)};
return gpu::run(gpu::reduce(array.size()), 0.0, func);
return gpu::run(gpu::reduce(array.size()), 0.0, [occ = begin(occupations), arr = begin(array)] GPU_LAMBDA (auto ip) {
return occ[ip]*real(arr[ip]);
});
}

public:
Expand Down
41 changes: 9 additions & 32 deletions src/hamiltonian/projector_all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,35 +233,6 @@ class projector_all {

////////////////////////////////////////////////////////////////////////////////////////////

template <typename OcType, typename PhiType, typename GPhiType>
struct force_term {
OcType oc;
PhiType phi;
GPhiType gphi;
constexpr auto operator()(int ist, int ip) const {
return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist]));
}
};

////////////////////////////////////////////////////////////////////////////////////////////

template <typename Projections, typename Coeff, typename Occupations>
struct energy_reduction {
Projections proj;
Coeff coe;
Occupations occ;
long spinor_size;

GPU_FUNCTION auto operator()(long ist, long ilm, long iproj) const {
auto ist_spinor = ist%spinor_size;
auto pp = proj[iproj][ilm][ist];
return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor];
}

};

////////////////////////////////////////////////////////////////////////////////////////////

template <typename KpointType, typename Occupations>
double energy(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const {

Expand Down Expand Up @@ -328,8 +299,12 @@ class projector_all {
}

auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0,
energy_reduction<decltype(begin(projections_all)), decltype(begin(coeff_)), decltype(begin(occupations))>
{begin(projections_all), begin(coeff_), begin(occupations), phi.local_spinor_set_size()});
[proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations), spinor_size = phi.local_spinor_set_size()]
GPU_LAMBDA (auto ist, auto ilm, auto iproj){
auto ist_spinor = ist%spinor_size;
auto pp = proj[iproj][ilm][ist];
return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor];
});

if(reduce_states and phi.set_comm().size() > 1) {
CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_states");
Expand Down Expand Up @@ -400,7 +375,9 @@ class projector_all {

CALI_CXX_MARK_SCOPE("projector_force_sum");
force[iproj] = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_sphere_size_), zero<vector3<double, covariant>>(),
force_term<decltype(begin(occs)), decltype(begin(sphere_phi_all[iproj])), decltype(begin(sphere_gphi_all[iproj]))>{begin(occs), begin(sphere_phi_all[iproj]), begin(sphere_gphi_all[iproj])});
[oc = begin(occs), phi = begin(sphere_phi_all[iproj]), gphi = begin(sphere_gphi_all[iproj])] GPU_LAMBDA (auto ist, auto ip) {
return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist]));
});
}

for(auto iproj = 0; iproj < nprojs_; iproj++) {
Expand Down
21 changes: 6 additions & 15 deletions src/observables/forces_stress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,6 @@
namespace inq {
namespace observables {

template <typename LongRangeType, typename ShortRangeType, typename GDensityType>
struct loc_pot {

LongRangeType v1;
ShortRangeType v2;
GDensityType gdensityp;

GPU_FUNCTION auto operator()(long ip) const {
return (v1[ip] + v2[ip])*gdensityp[ip];
}
};

struct forces_stress {
gpu::array<vector3<double>, 1> forces;
gpu::array<double, 2> stress;
Expand All @@ -45,7 +33,9 @@ struct forces_stress {
calculate(ions, electrons, ham);
}

#ifndef ENABLE_GPU
private:
#endif

template <typename HamiltonianType>
void calculate(const systems::ions & ions, systems::electrons const & electrons, HamiltonianType const & ham){
Expand Down Expand Up @@ -87,9 +77,10 @@ struct forces_stress {
auto ionic_short_range = electrons.atomic_pot().local_potential(electrons.states_comm(), electrons.density_basis(), ions, iatom);

auto force_cov = -gpu::run(gpu::reduce(electrons.density_basis().local_size()), zero<vector3<double, inq::covariant>>(),
loc_pot<decltype(begin(ionic_long_range.linear())), decltype(begin(ionic_short_range.linear())), decltype(begin(gdensity.linear()))>
{begin(ionic_long_range.linear()), begin(ionic_short_range.linear()), begin(gdensity.linear())});

[v1 = begin(ionic_long_range.linear()), v2 = begin(ionic_short_range.linear()), gdensityp = begin(gdensity.linear())] GPU_LAMBDA (auto ip) {
return (v1[ip] + v2[ip])*gdensityp[ip];
});

forces_local[iatom] = electrons.density_basis().volume_element()*ions.cell().metric().to_cartesian(force_cov);
}

Expand Down
32 changes: 6 additions & 26 deletions src/operations/overlap_diagonal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,6 @@
namespace inq {
namespace operations {

template <class mat_type>
struct overlap_diagonal_mult {

double factor;
mat_type mat1;
mat_type mat2;

GPU_FUNCTION auto operator()(long ist, long ip) const {
return factor*conj(mat1[ip][ist])*mat2[ip][ist];
}
};

template <class Basis, class PhiMatrix>
gpu::array<typename PhiMatrix::element_type, 1> overlap_diagonal_impl(Basis const & basis, PhiMatrix const & phi1_matrix, PhiMatrix const & phi2_matrix){
CALI_CXX_MARK_SCOPE("overlap_diagonal(2arg)");
Expand All @@ -51,7 +39,9 @@ gpu::array<typename PhiMatrix::element_type, 1> overlap_diagonal_impl(Basis cons
overlap_vector[0] *= basis.volume_element();
} else {
overlap_vector = gpu::run(nn, gpu::reduce(phi1_matrix.size()), zero<type>(),
overlap_diagonal_mult<decltype(begin(phi1_matrix))>{basis.volume_element(), begin(phi1_matrix), begin(phi2_matrix)});
[factor = basis.volume_element(), mat1 = begin(phi1_matrix), mat2 = begin(phi2_matrix)] GPU_LAMBDA (auto ist, auto ip) {
return factor*conj(mat1[ip][ist])*mat2[ip][ist];
});
}

if(basis.comm().size() > 1){
Expand Down Expand Up @@ -115,18 +105,6 @@ struct value_and_norm {
Type norm;
};

template <class mat_type>
struct overlap_diagonal_normalized_mult {

mat_type mat1;
mat_type mat2;

GPU_FUNCTION auto operator()(long ist, long ip) const {
return value_and_norm<decltype(conj(mat1[0][0])*mat2[0][0])>{conj(mat1[ip][ist])*mat2[ip][ist], conj(mat2[ip][ist])*mat2[ip][ist]};
}

};

struct identity {
template <typename Type>
GPU_FUNCTION auto operator()(Type const tt) const {
Expand All @@ -153,7 +131,9 @@ auto overlap_diagonal_normalized_impl(Basis const & basis, PhiMatrix const & phi
using type = typename PhiMatrix::element_type;

auto overlap_and_norm = gpu::run(nn, gpu::reduce(phi1_matrix.size()), zero<value_and_norm<type>>(),
overlap_diagonal_normalized_mult<decltype(begin(phi1_matrix))>{begin(phi1_matrix), begin(phi2_matrix)});
[mat1 = begin(phi1_matrix), mat2 = begin(phi2_matrix)] GPU_LAMBDA (auto ist, auto ip) {
return value_and_norm<type>{conj(mat1[ip][ist])*mat2[ip][ist], conj(mat2[ip][ist])*mat2[ip][ist]};
});

if(basis.comm().size() > 1){
CALI_CXX_MARK_SCOPE("overlap_diagonal_normalized::reduce");
Expand Down
2 changes: 1 addition & 1 deletion src/operations/sum.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ typename array_type::element sum(const array_type & phi){
CALI_CXX_MARK_SCOPE("sum(1arg)");
auto init = zero<typename array_type::element>();
if(phi.size() == 0) return init;
return gpu::run(gpu::reduce(phi.size()), init, gpu::array_access<decltype(begin(phi))>{begin(phi)});
return gpu::run(gpu::reduce(phi.size()), init, [ph = begin(phi)] GPU_LAMBDA (auto ii) { return ph[ii]; });
}

template <class ArrayType, class UnaryOp>
Expand Down

0 comments on commit a3b0217

Please sign in to comment.