From 1ec606f1c0360d9464828a24b6ba7042da0f10da Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 15:42:30 -0700 Subject: [PATCH 1/8] Directly pass lambdas instead of struct in overlap diagonal.. --- src/operations/overlap_diagonal.hpp | 32 ++++++----------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/src/operations/overlap_diagonal.hpp b/src/operations/overlap_diagonal.hpp index be74495a..9840ff77 100644 --- a/src/operations/overlap_diagonal.hpp +++ b/src/operations/overlap_diagonal.hpp @@ -22,18 +22,6 @@ namespace inq { namespace operations { -template -struct overlap_diagonal_mult { - - double factor; - mat_type mat1; - mat_type mat2; - - GPU_FUNCTION auto operator()(long ist, long ip) const { - return factor*conj(mat1[ip][ist])*mat2[ip][ist]; - } -}; - template gpu::array overlap_diagonal_impl(Basis const & basis, PhiMatrix const & phi1_matrix, PhiMatrix const & phi2_matrix){ CALI_CXX_MARK_SCOPE("overlap_diagonal(2arg)"); @@ -51,7 +39,9 @@ gpu::array overlap_diagonal_impl(Basis cons overlap_vector[0] *= basis.volume_element(); } else { overlap_vector = gpu::run(nn, gpu::reduce(phi1_matrix.size()), zero(), - overlap_diagonal_mult{basis.volume_element(), begin(phi1_matrix), begin(phi2_matrix)}); + [factor = basis.volume_element(), mat1 = begin(phi1_matrix), mat2 = begin(phi2_matrix)] GPU_LAMBDA (auto ist, auto ip) { + return factor*conj(mat1[ip][ist])*mat2[ip][ist]; + }); } if(basis.comm().size() > 1){ @@ -115,18 +105,6 @@ struct value_and_norm { Type norm; }; -template -struct overlap_diagonal_normalized_mult { - - mat_type mat1; - mat_type mat2; - - GPU_FUNCTION auto operator()(long ist, long ip) const { - return value_and_norm{conj(mat1[ip][ist])*mat2[ip][ist], conj(mat2[ip][ist])*mat2[ip][ist]}; - } - -}; - struct identity { template GPU_FUNCTION auto operator()(Type const tt) const { @@ -153,7 +131,9 @@ auto overlap_diagonal_normalized_impl(Basis const & basis, PhiMatrix const & phi using type = typename PhiMatrix::element_type; auto overlap_and_norm = gpu::run(nn, gpu::reduce(phi1_matrix.size()), zero>(), - overlap_diagonal_normalized_mult{begin(phi1_matrix), begin(phi2_matrix)}); + [mat1 = begin(phi1_matrix), mat2 = begin(phi2_matrix)] GPU_LAMBDA (auto ist, auto ip) { + return value_and_norm{conj(mat1[ip][ist])*mat2[ip][ist], conj(mat2[ip][ist])*mat2[ip][ist]}; + }); if(basis.comm().size() > 1){ CALI_CXX_MARK_SCOPE("overlap_diagonal_normalized::reduce"); From f020cffedbd47bb06b1573ac568b69251c91c895 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Mon, 21 Oct 2024 14:58:10 -0700 Subject: [PATCH 2/8] Use a lambda to calculate the non-local energy. 
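Same refactoring as the previous commit: the one-off energy_reduction functor is removed and its state is carried by lambda init-captures instead. The pattern used throughout this series is, schematically (all names taken from the diff below):

    auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0,
                       [proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations),
                        spinor_size = phi.local_spinor_set_size()] GPU_LAMBDA (auto ist, auto ilm, auto iproj) {
                         auto pp = proj[iproj][ilm][ist];
                         return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist%spinor_size];
                       });

The init-captures copy by value exactly the members the struct used to hold, and GPU_LAMBDA presumably supplies the host/device annotations the GPU backend needs, so the computation itself is unchanged; only the functor boilerplate goes away.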
--- src/hamiltonian/projector_all.hpp | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp index 2fa5b15b..4c2d3324 100644 --- a/src/hamiltonian/projector_all.hpp +++ b/src/hamiltonian/projector_all.hpp @@ -245,23 +245,6 @@ class projector_all { //////////////////////////////////////////////////////////////////////////////////////////// - template - struct energy_reduction { - Projections proj; - Coeff coe; - Occupations occ; - long spinor_size; - - GPU_FUNCTION auto operator()(long ist, long ilm, long iproj) const { - auto ist_spinor = ist%spinor_size; - auto pp = proj[iproj][ilm][ist]; - return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor]; - } - - }; - - //////////////////////////////////////////////////////////////////////////////////////////// - template double energy(states::orbital_set const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const { @@ -328,8 +311,12 @@ class projector_all { } auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0, - energy_reduction - {begin(projections_all), begin(coeff_), begin(occupations), phi.local_spinor_set_size()}); + [proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations), spinor_size = phi.local_spinor_set_size()] + GPU_LAMBDA (auto ist, auto ilm, auto iproj){ + auto ist_spinor = ist%spinor_size; + auto pp = proj[iproj][ilm][ist]; + return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor]; + }); if(reduce_states and phi.set_comm().size() > 1) { CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_states"); From 7edf5905a6be980ca910651b5de82bcd97c7abaa Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:10:24 -0700 Subject: [PATCH 3/8] Use lambdas in the test of gpu::reduce. 
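The test-only functors ident, prod and prod3 are dropped and each call site passes the equivalent lambda inline, e.g. (as in the diff below):

    CHECK(gpu::run(gpu::reduce(nn), -232.8, [] GPU_LAMBDA (auto ii) { return double(ii); })
          == Approx(-232.8 + (nn*(nn - 1.0)/2.0)));

While touching the file, two of the per-element CHECKs for the mixed run/reduce cases are also wrapped in Approx, so the floating-point results are compared with a tolerance rather than exactly.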
--- external_libs/gpurun/include/gpu/reduce.hpp | 30 +++++---------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp index 942efff3..13d560ed 100644 --- a/external_libs/gpurun/include/gpu/reduce.hpp +++ b/external_libs/gpurun/include/gpu/reduce.hpp @@ -435,24 +435,6 @@ gpu::array run(long sizex, reduce const & redy, reduce const & redz, T #include #include -struct ident { - GPU_FUNCTION auto operator()(long ii) const { - return double(ii); - } -}; - -struct prod { - GPU_FUNCTION auto operator()(long ix, long iy) const { - return double(ix)*double(iy); - } -}; - -struct prod3 { - GPU_FUNCTION auto operator()(long ix, long iy, long iz) const { - return double(ix)*double(iy)*double(iz); - } -}; - TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { using namespace Catch::literals; @@ -463,7 +445,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { int rank = 0; for(long nn = 1; nn <= maxsize; nn *= 3){ - CHECK(gpu::run(gpu::reduce(nn), -232.8, ident{}) == Approx(-232.8 + (nn*(nn - 1.0)/2.0))); + CHECK(gpu::run(gpu::reduce(nn), -232.8, [] GPU_LAMBDA (auto ii) { return double(ii);} ) == Approx(-232.8 + (nn*(nn - 1.0)/2.0))); rank++; } } @@ -476,7 +458,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { for(long nx = 1; nx <= maxsize; nx *= 5){ for(long ny = 1; ny <= maxsize; ny *= 5){ - auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), 2.23, prod{}); + auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), 2.23, [] GPU_LAMBDA (auto ix, auto iy) {return double(ix)*double(iy);}); CHECK(typeid(decltype(res)) == typeid(double)); CHECK(res == Approx(2.23 + nx*(nx - 1.0)/2.0*ny*(ny - 1.0)/2.0)); @@ -495,7 +477,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { for(long ny = 1; ny <= maxsize; ny *= 5){ for(long nz = 1; nz <= maxsize; nz *= 5){ - auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), gpu::reduce(nz), 17.89, prod3{}); + auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), gpu::reduce(nz), 17.89, [] GPU_LAMBDA (auto ix, auto iy, auto iz) {return double(ix)*double(iy)*double(iz);}); CHECK(typeid(decltype(res)) == typeid(double)); CHECK(res == Approx(17.89 + nx*(nx - 1.0)/2.0*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0)); @@ -518,7 +500,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { CHECK(typeid(decltype(res)) == typeid(gpu::array)); CHECK(res.size() == nx); - for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == -7.7 + double(ix)*ny*(ny - 1.0)/2.0); + for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == Approx(-7.7 + double(ix)*ny*(ny - 1.0)/2.0)); rank++; } } @@ -534,12 +516,12 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { for(long ny = 1; ny <= maxsize; ny *= 5){ for(long nz = 1; nz <= maxsize; nz *= 5){ - auto res = gpu::run(nx, gpu::reduce(ny), gpu::reduce(nz), 10.0, prod3{}); + auto res = gpu::run(nx, gpu::reduce(ny), gpu::reduce(nz), 10.0, [] GPU_LAMBDA (auto ix, auto iy, auto iz) {return double(ix)*double(iy)*double(iz);}); CHECK(typeid(decltype(res)) == typeid(gpu::array)); CHECK(res.size() == nx); - for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == 10.0 + double(ix)*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0); + for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == Approx(10.0 + double(ix)*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0)); rank++; } } From becd8e81fa03624b9f32934d02db0dba6465ac76 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:21:05 -0700 Subject: [PATCH 4/8] Use a lambda in state_convergence. 
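The state_conv_func helper struct is replaced by a lambda defined at its single point of use; in sketch form (names as in the diff below):

    state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0,
                           [occ = begin(el.occupations()[iphi]), arr = begin(normres[iphi])]
                           GPU_LAMBDA (auto ip) { return fabs(occ[ip]*arr[ip]); });

The reduction itself is unchanged; only how the occupation and residual accessors are captured differs.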
--- src/ground_state/calculator.hpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/ground_state/calculator.hpp b/src/ground_state/calculator.hpp index 7362fba1..6394246a 100644 --- a/src/ground_state/calculator.hpp +++ b/src/ground_state/calculator.hpp @@ -68,16 +68,6 @@ class calculator { public: #endif - template - struct state_conv_func { - OccType occ; - ArrayType arr; - - GPU_FUNCTION double operator()(long ip) const { - return fabs(occ[ip]*arr[ip]); - } - }; - template static double state_convergence(systems::electrons & el, NormResType const & normres) { CALI_CXX_MARK_FUNCTION; @@ -87,8 +77,9 @@ class calculator { for(int iphi = 0; iphi < el.kpin_size(); iphi++){ assert(el.occupations()[iphi].size() == normres[iphi].size()); - auto func = state_conv_func{begin(el.occupations()[iphi]), begin(normres[iphi])}; - state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0, func); + state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0, [occ = begin(el.occupations()[iphi]), arr = begin(normres[iphi])] GPU_LAMBDA (auto ip) { + return fabs(occ[ip]*arr[ip]); + }); } el.kpin_states_comm().all_reduce_n(&state_conv, 1); From 10f80f2fbe8817049a6a4b51aebdbd61d3d1bdae Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:26:12 -0700 Subject: [PATCH 5/8] Use a lambda in occ_sum. --- src/hamiltonian/energy.hpp | 15 +++------------ src/hamiltonian/ks_hamiltonian.hpp | 15 +++------------ 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/src/hamiltonian/energy.hpp b/src/hamiltonian/energy.hpp index de948dc6..643aaf98 100644 --- a/src/hamiltonian/energy.hpp +++ b/src/hamiltonian/energy.hpp @@ -32,23 +32,14 @@ namespace hamiltonian { public: #endif - template - struct occ_sum_func { - OccType occ; - ArrayType arr; - - GPU_FUNCTION double operator()(long ip) const { - return occ[ip]*real(arr[ip]); - } - }; - template static double occ_sum(OccType const & occupations, ArrayType const & array) { CALI_CXX_MARK_FUNCTION; assert(occupations.size() == array.size()); - auto func = occ_sum_func{begin(occupations), begin(array)}; - return gpu::run(gpu::reduce(array.size()), 0.0, func); + return gpu::run(gpu::reduce(array.size()), 0.0, [occ = begin(occupations), arr = begin(array)] GPU_LAMBDA (auto ip) { + return occ[ip]*real(arr[ip]); + }); } public: diff --git a/src/hamiltonian/ks_hamiltonian.hpp b/src/hamiltonian/ks_hamiltonian.hpp index 8ce069dd..cc9d3721 100644 --- a/src/hamiltonian/ks_hamiltonian.hpp +++ b/src/hamiltonian/ks_hamiltonian.hpp @@ -54,23 +54,14 @@ class ks_hamiltonian { public: #endif - template - struct occ_sum_func { - OccType occ; - ArrayType arr; - - GPU_FUNCTION double operator()(long ip) const { - return occ[ip]*real(arr[ip]); - } - }; - template static double occ_sum(OccType const & occupations, ArrayType const & array) { CALI_CXX_MARK_FUNCTION; assert(occupations.size() == array.size()); - auto func = occ_sum_func{begin(occupations), begin(array)}; - return gpu::run(gpu::reduce(array.size()), 0.0, func); + return gpu::run(gpu::reduce(array.size()), 0.0, [occ = begin(occupations), arr = begin(array)] GPU_LAMBDA (auto ip) { + return occ[ip]*real(arr[ip]); + }); } public: From 9e9c236fa81532f6873d04d9321a11f1ea4477a7 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:32:26 -0700 Subject: [PATCH 6/8] Use a lambda for the calculation of the non-local force. 
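As elsewhere in this series, the force_term functor is folded into its gpu::run call. The reduced quantity is the same; only the capture mechanism changes (lambda body as in the diff below):

    [oc = begin(occs), phi = begin(sphere_phi_all[iproj]), gphi = begin(sphere_gphi_all[iproj])]
    GPU_LAMBDA (auto ist, auto ip) {
      return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist]));
    }

Note that the init-capture named phi shadows the outer orbital set inside the lambda, matching the member name of the removed struct.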
--- src/hamiltonian/projector_all.hpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp index 4c2d3324..4ec55dbd 100644 --- a/src/hamiltonian/projector_all.hpp +++ b/src/hamiltonian/projector_all.hpp @@ -233,18 +233,6 @@ class projector_all { //////////////////////////////////////////////////////////////////////////////////////////// - template - struct force_term { - OcType oc; - PhiType phi; - GPhiType gphi; - constexpr auto operator()(int ist, int ip) const { - return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist])); - } - }; - - //////////////////////////////////////////////////////////////////////////////////////////// - template double energy(states::orbital_set const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const { @@ -387,7 +375,9 @@ class projector_all { CALI_CXX_MARK_SCOPE("projector_force_sum"); force[iproj] = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_sphere_size_), zero>(), - force_term{begin(occs), begin(sphere_phi_all[iproj]), begin(sphere_gphi_all[iproj])}); + [oc = begin(occs), phi = begin(sphere_phi_all[iproj]), gphi = begin(sphere_gphi_all[iproj])] GPU_LAMBDA (auto ist, auto ip) { + return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist])); + }); } for(auto iproj = 0; iproj < nprojs_; iproj++) { From da1b88206e6f4b1797759f43df9591a729246f87 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:39:47 -0700 Subject: [PATCH 7/8] Use a lambda in the calculation of the local part of the forces. --- src/observables/forces_stress.hpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/observables/forces_stress.hpp b/src/observables/forces_stress.hpp index ebc4690d..366f7895 100644 --- a/src/observables/forces_stress.hpp +++ b/src/observables/forces_stress.hpp @@ -19,18 +19,6 @@ namespace inq { namespace observables { -template -struct loc_pot { - - LongRangeType v1; - ShortRangeType v2; - GDensityType gdensityp; - - GPU_FUNCTION auto operator()(long ip) const { - return (v1[ip] + v2[ip])*gdensityp[ip]; - } -}; - struct forces_stress { gpu::array, 1> forces; gpu::array stress; @@ -45,7 +33,9 @@ struct forces_stress { calculate(ions, electrons, ham); } +#ifndef ENABLE_GPU private: +#endif template void calculate(const systems::ions & ions, systems::electrons const & electrons, HamiltonianType const & ham){ @@ -87,9 +77,10 @@ struct forces_stress { auto ionic_short_range = electrons.atomic_pot().local_potential(electrons.states_comm(), electrons.density_basis(), ions, iatom); auto force_cov = -gpu::run(gpu::reduce(electrons.density_basis().local_size()), zero>(), - loc_pot - {begin(ionic_long_range.linear()), begin(ionic_short_range.linear()), begin(gdensity.linear())}); - + [v1 = begin(ionic_long_range.linear()), v2 = begin(ionic_short_range.linear()), gdensityp = begin(gdensity.linear())] GPU_LAMBDA (auto ip) { + return (v1[ip] + v2[ip])*gdensityp[ip]; + }); + forces_local[iatom] = electrons.density_basis().volume_element()*ions.cell().metric().to_cartesian(force_cov); } From 714271d399ed1ca1733d58ca01c17c66c129111d Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:47:06 -0700 Subject: [PATCH 8/8] Use a lambda for the gpu::reduce in sum. 
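The gpu::array_access helper is no longer needed in sum(): a one-line lambda performs the same element lookup (as in the diff below):

    return gpu::run(gpu::reduce(phi.size()), init, [ph = begin(phi)] GPU_LAMBDA (auto ii) { return ph[ii]; });

This keeps the access pattern next to the reduction it feeds, consistent with the earlier commits in this series.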
--- src/operations/sum.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operations/sum.hpp b/src/operations/sum.hpp index 75346623..58db9f33 100644 --- a/src/operations/sum.hpp +++ b/src/operations/sum.hpp @@ -28,7 +28,7 @@ typename array_type::element sum(const array_type & phi){ CALI_CXX_MARK_SCOPE("sum(1arg)"); auto init = zero(); if(phi.size() == 0) return init; - return gpu::run(gpu::reduce(phi.size()), init, gpu::array_access{begin(phi)}); + return gpu::run(gpu::reduce(phi.size()), init, [ph = begin(phi)] GPU_LAMBDA (auto ii) { return ph[ii]; }); } template