From 1ec606f1c0360d9464828a24b6ba7042da0f10da Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 15:42:30 -0700 Subject: [PATCH 1/8] Directly pass lambdas instead of struct in overlap diagonal.. --- src/operations/overlap_diagonal.hpp | 32 ++++++----------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/src/operations/overlap_diagonal.hpp b/src/operations/overlap_diagonal.hpp index be74495a..9840ff77 100644 --- a/src/operations/overlap_diagonal.hpp +++ b/src/operations/overlap_diagonal.hpp @@ -22,18 +22,6 @@ namespace inq { namespace operations { -template -struct overlap_diagonal_mult { - - double factor; - mat_type mat1; - mat_type mat2; - - GPU_FUNCTION auto operator()(long ist, long ip) const { - return factor*conj(mat1[ip][ist])*mat2[ip][ist]; - } -}; - template gpu::array overlap_diagonal_impl(Basis const & basis, PhiMatrix const & phi1_matrix, PhiMatrix const & phi2_matrix){ CALI_CXX_MARK_SCOPE("overlap_diagonal(2arg)"); @@ -51,7 +39,9 @@ gpu::array overlap_diagonal_impl(Basis cons overlap_vector[0] *= basis.volume_element(); } else { overlap_vector = gpu::run(nn, gpu::reduce(phi1_matrix.size()), zero(), - overlap_diagonal_mult{basis.volume_element(), begin(phi1_matrix), begin(phi2_matrix)}); + [factor = basis.volume_element(), mat1 = begin(phi1_matrix), mat2 = begin(phi2_matrix)] GPU_LAMBDA (auto ist, auto ip) { + return factor*conj(mat1[ip][ist])*mat2[ip][ist]; + }); } if(basis.comm().size() > 1){ @@ -115,18 +105,6 @@ struct value_and_norm { Type norm; }; -template -struct overlap_diagonal_normalized_mult { - - mat_type mat1; - mat_type mat2; - - GPU_FUNCTION auto operator()(long ist, long ip) const { - return value_and_norm{conj(mat1[ip][ist])*mat2[ip][ist], conj(mat2[ip][ist])*mat2[ip][ist]}; - } - -}; - struct identity { template GPU_FUNCTION auto operator()(Type const tt) const { @@ -153,7 +131,9 @@ auto overlap_diagonal_normalized_impl(Basis const & basis, PhiMatrix const & phi using type = typename PhiMatrix::element_type; auto overlap_and_norm = gpu::run(nn, gpu::reduce(phi1_matrix.size()), zero>(), - overlap_diagonal_normalized_mult{begin(phi1_matrix), begin(phi2_matrix)}); + [mat1 = begin(phi1_matrix), mat2 = begin(phi2_matrix)] GPU_LAMBDA (auto ist, auto ip) { + return value_and_norm{conj(mat1[ip][ist])*mat2[ip][ist], conj(mat2[ip][ist])*mat2[ip][ist]}; + }); if(basis.comm().size() > 1){ CALI_CXX_MARK_SCOPE("overlap_diagonal_normalized::reduce"); From f020cffedbd47bb06b1573ac568b69251c91c895 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Mon, 21 Oct 2024 14:58:10 -0700 Subject: [PATCH 2/8] Use a lambda to calculate the non-local energy. 
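Same refactoring as the previous commit: the one-off energy_reduction functor is removed and its state is carried by lambda init-captures instead. The pattern used throughout this series is, schematically (all names taken from the diff below):

    auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0,
                       [proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations),
                        spinor_size = phi.local_spinor_set_size()] GPU_LAMBDA (auto ist, auto ilm, auto iproj) {
                         auto pp = proj[iproj][ilm][ist];
                         return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist%spinor_size];
                       });

The init-captures copy by value exactly the members the struct used to hold, and GPU_LAMBDA presumably supplies the host/device annotations the GPU backend needs, so the computation itself is unchanged; only the functor boilerplate goes away.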
--- src/hamiltonian/projector_all.hpp | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp index 2fa5b15b..4c2d3324 100644 --- a/src/hamiltonian/projector_all.hpp +++ b/src/hamiltonian/projector_all.hpp @@ -245,23 +245,6 @@ class projector_all { //////////////////////////////////////////////////////////////////////////////////////////// - template - struct energy_reduction { - Projections proj; - Coeff coe; - Occupations occ; - long spinor_size; - - GPU_FUNCTION auto operator()(long ist, long ilm, long iproj) const { - auto ist_spinor = ist%spinor_size; - auto pp = proj[iproj][ilm][ist]; - return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor]; - } - - }; - - //////////////////////////////////////////////////////////////////////////////////////////// - template double energy(states::orbital_set const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const { @@ -328,8 +311,12 @@ class projector_all { } auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), 0.0, - energy_reduction - {begin(projections_all), begin(coeff_), begin(occupations), phi.local_spinor_set_size()}); + [proj = begin(projections_all), coe = begin(coeff_), occ = begin(occupations), spinor_size = phi.local_spinor_set_size()] + GPU_LAMBDA (auto ist, auto ilm, auto iproj){ + auto ist_spinor = ist%spinor_size; + auto pp = proj[iproj][ilm][ist]; + return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor]; + }); if(reduce_states and phi.set_comm().size() > 1) { CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_states"); From 7edf5905a6be980ca910651b5de82bcd97c7abaa Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:10:24 -0700 Subject: [PATCH 3/8] Use lambdas in the test of gpu::reduce. 
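The test-only functors ident, prod and prod3 are dropped and each call site passes the equivalent lambda inline, e.g. (as in the diff below):

    CHECK(gpu::run(gpu::reduce(nn), -232.8, [] GPU_LAMBDA (auto ii) { return double(ii); })
          == Approx(-232.8 + (nn*(nn - 1.0)/2.0)));

While touching the file, two of the per-element CHECKs for the mixed run/reduce cases are also wrapped in Approx, so the floating-point results are compared with a tolerance rather than exactly.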
--- external_libs/gpurun/include/gpu/reduce.hpp | 30 +++++---------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp index 942efff3..13d560ed 100644 --- a/external_libs/gpurun/include/gpu/reduce.hpp +++ b/external_libs/gpurun/include/gpu/reduce.hpp @@ -435,24 +435,6 @@ gpu::array run(long sizex, reduce const & redy, reduce const & redz, T #include #include -struct ident { - GPU_FUNCTION auto operator()(long ii) const { - return double(ii); - } -}; - -struct prod { - GPU_FUNCTION auto operator()(long ix, long iy) const { - return double(ix)*double(iy); - } -}; - -struct prod3 { - GPU_FUNCTION auto operator()(long ix, long iy, long iz) const { - return double(ix)*double(iy)*double(iz); - } -}; - TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { using namespace Catch::literals; @@ -463,7 +445,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { int rank = 0; for(long nn = 1; nn <= maxsize; nn *= 3){ - CHECK(gpu::run(gpu::reduce(nn), -232.8, ident{}) == Approx(-232.8 + (nn*(nn - 1.0)/2.0))); + CHECK(gpu::run(gpu::reduce(nn), -232.8, [] GPU_LAMBDA (auto ii) { return double(ii);} ) == Approx(-232.8 + (nn*(nn - 1.0)/2.0))); rank++; } } @@ -476,7 +458,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { for(long nx = 1; nx <= maxsize; nx *= 5){ for(long ny = 1; ny <= maxsize; ny *= 5){ - auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), 2.23, prod{}); + auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), 2.23, [] GPU_LAMBDA (auto ix, auto iy) {return double(ix)*double(iy);}); CHECK(typeid(decltype(res)) == typeid(double)); CHECK(res == Approx(2.23 + nx*(nx - 1.0)/2.0*ny*(ny - 1.0)/2.0)); @@ -495,7 +477,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { for(long ny = 1; ny <= maxsize; ny *= 5){ for(long nz = 1; nz <= maxsize; nz *= 5){ - auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), gpu::reduce(nz), 17.89, prod3{}); + auto res = gpu::run(gpu::reduce(nx), gpu::reduce(ny), gpu::reduce(nz), 17.89, [] GPU_LAMBDA (auto ix, auto iy, auto iz) {return double(ix)*double(iy)*double(iz);}); CHECK(typeid(decltype(res)) == typeid(double)); CHECK(res == Approx(17.89 + nx*(nx - 1.0)/2.0*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0)); @@ -518,7 +500,7 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { CHECK(typeid(decltype(res)) == typeid(gpu::array)); CHECK(res.size() == nx); - for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == -7.7 + double(ix)*ny*(ny - 1.0)/2.0); + for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == Approx(-7.7 + double(ix)*ny*(ny - 1.0)/2.0)); rank++; } } @@ -534,12 +516,12 @@ TEST_CASE(GPURUN_TEST_FILE, GPURUN_TEST_TAG) { for(long ny = 1; ny <= maxsize; ny *= 5){ for(long nz = 1; nz <= maxsize; nz *= 5){ - auto res = gpu::run(nx, gpu::reduce(ny), gpu::reduce(nz), 10.0, prod3{}); + auto res = gpu::run(nx, gpu::reduce(ny), gpu::reduce(nz), 10.0, [] GPU_LAMBDA (auto ix, auto iy, auto iz) {return double(ix)*double(iy)*double(iz);}); CHECK(typeid(decltype(res)) == typeid(gpu::array)); CHECK(res.size() == nx); - for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == 10.0 + double(ix)*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0); + for(long ix = 0; ix < nx; ix++) CHECK(res[ix] == Approx(10.0 + double(ix)*ny*(ny - 1.0)/2.0*nz*(nz - 1.0)/2.0)); rank++; } } From becd8e81fa03624b9f32934d02db0dba6465ac76 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:21:05 -0700 Subject: [PATCH 4/8] Use a lambda in state_convergence. 
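The state_conv_func helper struct is replaced by a lambda defined at its single point of use; in sketch form (names as in the diff below):

    state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0,
                           [occ = begin(el.occupations()[iphi]), arr = begin(normres[iphi])]
                           GPU_LAMBDA (auto ip) { return fabs(occ[ip]*arr[ip]); });

The reduction itself is unchanged; only how the occupation and residual accessors are captured differs.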
--- src/ground_state/calculator.hpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/ground_state/calculator.hpp b/src/ground_state/calculator.hpp index 7362fba1..6394246a 100644 --- a/src/ground_state/calculator.hpp +++ b/src/ground_state/calculator.hpp @@ -68,16 +68,6 @@ class calculator { public: #endif - template - struct state_conv_func { - OccType occ; - ArrayType arr; - - GPU_FUNCTION double operator()(long ip) const { - return fabs(occ[ip]*arr[ip]); - } - }; - template static double state_convergence(systems::electrons & el, NormResType const & normres) { CALI_CXX_MARK_FUNCTION; @@ -87,8 +77,9 @@ class calculator { for(int iphi = 0; iphi < el.kpin_size(); iphi++){ assert(el.occupations()[iphi].size() == normres[iphi].size()); - auto func = state_conv_func{begin(el.occupations()[iphi]), begin(normres[iphi])}; - state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0, func); + state_conv += gpu::run(gpu::reduce(normres[iphi].size()), 0.0, [occ = begin(el.occupations()[iphi]), arr = begin(normres[iphi])] GPU_LAMBDA (auto ip) { + return fabs(occ[ip]*arr[ip]); + }); } el.kpin_states_comm().all_reduce_n(&state_conv, 1); From 10f80f2fbe8817049a6a4b51aebdbd61d3d1bdae Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:26:12 -0700 Subject: [PATCH 5/8] Use a lambda in occ_sum. --- src/hamiltonian/energy.hpp | 15 +++------------ src/hamiltonian/ks_hamiltonian.hpp | 15 +++------------ 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/src/hamiltonian/energy.hpp b/src/hamiltonian/energy.hpp index de948dc6..643aaf98 100644 --- a/src/hamiltonian/energy.hpp +++ b/src/hamiltonian/energy.hpp @@ -32,23 +32,14 @@ namespace hamiltonian { public: #endif - template - struct occ_sum_func { - OccType occ; - ArrayType arr; - - GPU_FUNCTION double operator()(long ip) const { - return occ[ip]*real(arr[ip]); - } - }; - template static double occ_sum(OccType const & occupations, ArrayType const & array) { CALI_CXX_MARK_FUNCTION; assert(occupations.size() == array.size()); - auto func = occ_sum_func{begin(occupations), begin(array)}; - return gpu::run(gpu::reduce(array.size()), 0.0, func); + return gpu::run(gpu::reduce(array.size()), 0.0, [occ = begin(occupations), arr = begin(array)] GPU_LAMBDA (auto ip) { + return occ[ip]*real(arr[ip]); + }); } public: diff --git a/src/hamiltonian/ks_hamiltonian.hpp b/src/hamiltonian/ks_hamiltonian.hpp index 8ce069dd..cc9d3721 100644 --- a/src/hamiltonian/ks_hamiltonian.hpp +++ b/src/hamiltonian/ks_hamiltonian.hpp @@ -54,23 +54,14 @@ class ks_hamiltonian { public: #endif - template - struct occ_sum_func { - OccType occ; - ArrayType arr; - - GPU_FUNCTION double operator()(long ip) const { - return occ[ip]*real(arr[ip]); - } - }; - template static double occ_sum(OccType const & occupations, ArrayType const & array) { CALI_CXX_MARK_FUNCTION; assert(occupations.size() == array.size()); - auto func = occ_sum_func{begin(occupations), begin(array)}; - return gpu::run(gpu::reduce(array.size()), 0.0, func); + return gpu::run(gpu::reduce(array.size()), 0.0, [occ = begin(occupations), arr = begin(array)] GPU_LAMBDA (auto ip) { + return occ[ip]*real(arr[ip]); + }); } public: From 9e9c236fa81532f6873d04d9321a11f1ea4477a7 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:32:26 -0700 Subject: [PATCH 6/8] Use a lambda for the calculation of the non-local force. 
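As elsewhere in this series, the force_term functor is folded into its gpu::run call. The reduced quantity is the same; only the capture mechanism changes (lambda body as in the diff below):

    [oc = begin(occs), phi = begin(sphere_phi_all[iproj]), gphi = begin(sphere_gphi_all[iproj])]
    GPU_LAMBDA (auto ist, auto ip) {
      return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist]));
    }

Note that the init-capture named phi shadows the outer orbital set inside the lambda, matching the member name of the removed struct.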
--- src/hamiltonian/projector_all.hpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp index 4c2d3324..4ec55dbd 100644 --- a/src/hamiltonian/projector_all.hpp +++ b/src/hamiltonian/projector_all.hpp @@ -233,18 +233,6 @@ class projector_all { //////////////////////////////////////////////////////////////////////////////////////////// - template - struct force_term { - OcType oc; - PhiType phi; - GPhiType gphi; - constexpr auto operator()(int ist, int ip) const { - return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist])); - } - }; - - //////////////////////////////////////////////////////////////////////////////////////////// - template double energy(states::orbital_set const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const { @@ -387,7 +375,9 @@ class projector_all { CALI_CXX_MARK_SCOPE("projector_force_sum"); force[iproj] = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_sphere_size_), zero>(), - force_term{begin(occs), begin(sphere_phi_all[iproj]), begin(sphere_gphi_all[iproj])}); + [oc = begin(occs), phi = begin(sphere_phi_all[iproj]), gphi = begin(sphere_gphi_all[iproj])] GPU_LAMBDA (auto ist, auto ip) { + return -2.0*oc[ist]*real(phi[ip][ist]*conj(gphi[ip][ist])); + }); } for(auto iproj = 0; iproj < nprojs_; iproj++) { From da1b88206e6f4b1797759f43df9591a729246f87 Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:39:47 -0700 Subject: [PATCH 7/8] Use a lambda in the calculation of the local part of the forces. --- src/observables/forces_stress.hpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/observables/forces_stress.hpp b/src/observables/forces_stress.hpp index ebc4690d..366f7895 100644 --- a/src/observables/forces_stress.hpp +++ b/src/observables/forces_stress.hpp @@ -19,18 +19,6 @@ namespace inq { namespace observables { -template -struct loc_pot { - - LongRangeType v1; - ShortRangeType v2; - GDensityType gdensityp; - - GPU_FUNCTION auto operator()(long ip) const { - return (v1[ip] + v2[ip])*gdensityp[ip]; - } -}; - struct forces_stress { gpu::array, 1> forces; gpu::array stress; @@ -45,7 +33,9 @@ struct forces_stress { calculate(ions, electrons, ham); } +#ifndef ENABLE_GPU private: +#endif template void calculate(const systems::ions & ions, systems::electrons const & electrons, HamiltonianType const & ham){ @@ -87,9 +77,10 @@ struct forces_stress { auto ionic_short_range = electrons.atomic_pot().local_potential(electrons.states_comm(), electrons.density_basis(), ions, iatom); auto force_cov = -gpu::run(gpu::reduce(electrons.density_basis().local_size()), zero>(), - loc_pot - {begin(ionic_long_range.linear()), begin(ionic_short_range.linear()), begin(gdensity.linear())}); - + [v1 = begin(ionic_long_range.linear()), v2 = begin(ionic_short_range.linear()), gdensityp = begin(gdensity.linear())] GPU_LAMBDA (auto ip) { + return (v1[ip] + v2[ip])*gdensityp[ip]; + }); + forces_local[iatom] = electrons.density_basis().volume_element()*ions.cell().metric().to_cartesian(force_cov); } From 714271d399ed1ca1733d58ca01c17c66c129111d Mon Sep 17 00:00:00 2001 From: Xavier Andrade Date: Tue, 22 Oct 2024 21:47:06 -0700 Subject: [PATCH 8/8] Use a lambda for the gpu::reduce in sum. 
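The gpu::array_access helper is no longer needed in sum(): a one-line lambda performs the same element lookup (as in the diff below):

    return gpu::run(gpu::reduce(phi.size()), init, [ph = begin(phi)] GPU_LAMBDA (auto ii) { return ph[ii]; });

This keeps the access pattern next to the reduction it feeds, consistent with the earlier commits in this series.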
--- src/operations/sum.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operations/sum.hpp b/src/operations/sum.hpp index 75346623..58db9f33 100644 --- a/src/operations/sum.hpp +++ b/src/operations/sum.hpp @@ -28,7 +28,7 @@ typename array_type::element sum(const array_type & phi){ CALI_CXX_MARK_SCOPE("sum(1arg)"); auto init = zero(); if(phi.size() == 0) return init; - return gpu::run(gpu::reduce(phi.size()), init, gpu::array_access{begin(phi)}); + return gpu::run(gpu::reduce(phi.size()), init, [ph = begin(phi)] GPU_LAMBDA (auto ii) { return ph[ii]; }); } template