From f0752d90572e64cd57c34035d157fbafa1a75d70 Mon Sep 17 00:00:00 2001
From: Xavier Andrade <xavier@tddft.org>
Date: Fri, 11 Oct 2024 13:40:31 -0700
Subject: [PATCH 1/3] Check that vector3 is initialized to 0 with {}.

---
 external_libs/gpurun/include/gpu/reduce.hpp | 14 +++++---------
 src/math/vector3.hpp                        |  8 ++++++++
 2 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp
index 3534f9a7..a3c1deaa 100644
--- a/external_libs/gpurun/include/gpu/reduce.hpp
+++ b/external_libs/gpurun/include/gpu/reduce.hpp
@@ -12,6 +12,7 @@
 #include <inq_config.h>
 
 #include <cassert>
+#include <numeric>
 
 #include <gpu/run.hpp>
 #include <gpu/array.hpp>
@@ -80,15 +81,10 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
 	auto const size = red.size;
 	
   using type = decltype(kernel(0));
-  
-#ifndef ENABLE_GPU
-
-  type accumulator(0.0);
-  for(long ii = 0; ii < size; ii++){
-    accumulator += kernel(ii);
-  }
-  return accumulator;
 
+#ifndef ENABLE_GPU
+	auto range = boost::multi::extension_t{0l, size};
+	return std::transform_reduce(range.begin(), range.end(), type{}, std::plus<>{}, kernel);
 #else
 
 	const int blocksize = 1024;
@@ -144,7 +140,7 @@ __global__ void reduce_kernel_rr(long sizex, long sizey, kernel_type kernel, arr
 #endif
 
 template <class kernel_type>
-auto run(reduce const & redx, reduce const & redy, kernel_type kernel) -> decltype(kernel(0, 0)) {
+auto run(gpu::reduce const & redx, gpu::reduce const & redy, kernel_type kernel) -> decltype(kernel(0, 0)) {
 
 	auto const sizex = redx.size;	
 	auto const sizey = redy.size;	
diff --git a/src/math/vector3.hpp b/src/math/vector3.hpp
index 509b511c..5b1a29d2 100644
--- a/src/math/vector3.hpp
+++ b/src/math/vector3.hpp
@@ -345,6 +345,14 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG) {
 		vector3<int> vv; (void)vv;
 	}
 
+	SECTION("Zero by default constructor"){
+		vector3<int> vv{};
+		// empty-brace initialization must leave every component equal to zero
+		CHECK(vv[0] == 0);
+		CHECK(vv[1] == 0);
+		CHECK(vv[2] == 0);
+	}
+
 	SECTION("Scalar"){
 		vector3<double> vv(-45.677);
 		

From 28df0460d44c2616b13cb217f150b13db449c9bd Mon Sep 17 00:00:00 2001
From: Xavier Andrade <xavier@tddft.org>
Date: Fri, 11 Oct 2024 14:09:17 -0700
Subject: [PATCH 2/3] Use transform_reduce for the 1D reduction in gpu::reduce.

---
 external_libs/gpurun/include/gpu/reduce.hpp | 59 +++------------------
 1 file changed, 8 insertions(+), 51 deletions(-)

diff --git a/external_libs/gpurun/include/gpu/reduce.hpp b/external_libs/gpurun/include/gpu/reduce.hpp
index a3c1deaa..cdb40a16 100644
--- a/external_libs/gpurun/include/gpu/reduce.hpp
+++ b/external_libs/gpurun/include/gpu/reduce.hpp
@@ -12,7 +12,13 @@
 #include <inq_config.h>
 
 #include <cassert>
+
+#ifndef ENABLE_GPU
 #include <numeric>
+#else
+#include <thrust/execution_policy.h>
+#include <thrust/transform_reduce.h>
+#endif
 
 #include <gpu/run.hpp>
 #include <gpu/array.hpp>
@@ -27,40 +33,6 @@ struct reduce {
 	long size;
 };
 
-
-#ifdef ENABLE_GPU
-template <class kernel_type, class array_type>
-__global__ void reduce_kernel_r(long size, kernel_type kernel, array_type odata) {
-
-	extern __shared__ char shared_mem[];
-	auto reduction_buffer = (typename array_type::element *) shared_mem;
-	
-	// each thread loads one element from global to shared mem
-	unsigned int tid = threadIdx.x;
-	unsigned int ii = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if(ii < size){
-		reduction_buffer[tid] = kernel(ii);
-	} else {
-		reduction_buffer[tid] = (typename array_type::element) 0.0;
-	}
-
-	__syncthreads();
-
-	// do reduction in shared mem
-	for (unsigned int s = blockDim.x/2; s > 0; s >>= 1){
-		if (tid < s) {
-			reduction_buffer[tid] += reduction_buffer[tid + s];
-		}
-		__syncthreads();
-	}
-	
-	// write result for this block to global mem
-	if (tid == 0) odata[blockIdx.x] = reduction_buffer[0];
-
-}
-#endif
-
 template <typename array_type>
 struct array_access {
   array_type array;
@@ -81,27 +53,12 @@ auto run(reduce const & red, kernel_type kernel) -> decltype(kernel(0)) {
 	auto const size = red.size;
 	
   using type = decltype(kernel(0));
+	auto range = boost::multi::extension_t{0l, size};
 
 #ifndef ENABLE_GPU
-	auto range = boost::multi::extension_t{0l, size};
 	return std::transform_reduce(range.begin(), range.end(), type{}, std::plus<>{}, kernel);
 #else
-
-	const int blocksize = 1024;
-
-	unsigned nblock = (size + blocksize - 1)/blocksize;
-	gpu::array<type, 1> result(nblock);
-
-  reduce_kernel_r<<<nblock, blocksize, blocksize*sizeof(type)>>>(size, kernel, begin(result));	
-  check_error(last_error());
-	
-  if(nblock == 1) {
-    gpu::sync();
-    return result[0];
-  } else {
-    return run(gpu::reduce(nblock), array_access<decltype(begin(result))>{begin(result)});
-  }
-  
+	return thrust::transform_reduce(thrust::device, range.begin(), range.end(), kernel, type{}, std::plus<>{});
 #endif
 }
 

From 92b1f945e2114b7140c82293f8aacd987d4b4773 Mon Sep 17 00:00:00 2001
From: Xavier Andrade <xavier@tddft.org>
Date: Sun, 13 Oct 2024 02:14:48 -0700
Subject: [PATCH 3/3] Do not restrict the number of scf steps in the XC test
 calculation.

---
 src/hamiltonian/xc_term.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/hamiltonian/xc_term.hpp b/src/hamiltonian/xc_term.hpp
index 03bdbbcf..7eb87b7f 100644
--- a/src/hamiltonian/xc_term.hpp
+++ b/src/hamiltonian/xc_term.hpp
@@ -496,7 +496,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
 		ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
 		auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_unpolarized());
 		ground_state::initial_guess(ions, electrons);
-		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
+		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
 		auto nvxc = result.energy.nvxc();
 		auto exc = result.energy.xc();
 		Approx target = Approx(nvxc).epsilon(1.e-10);
@@ -517,7 +517,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
 		ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
 		auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_polarized());
 		ground_state::initial_guess(ions, electrons);
-		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
+		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
 		auto nvxc = result.energy.nvxc();
 		auto exc = result.energy.xc();
 		Approx target = Approx(nvxc).epsilon(1.e-10);
@@ -538,7 +538,7 @@ TEST_CASE(INQ_TEST_FILE, INQ_TEST_TAG){
 		ions.insert("H", {0.0_b, 0.0_b, 0.0_b});
 		auto electrons = systems::electrons(par, ions, options::electrons{}.cutoff(30.0_Ha).extra_states(2).spin_non_collinear());
 		ground_state::initial_guess(ions, electrons);
-		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha).max_steps(100));
+		auto result = ground_state::calculate(ions, electrons, options::theory{}.lda(), options::ground_state{}.steepest_descent().energy_tolerance(1.e-8_Ha));
 		auto nvxc = result.energy.nvxc();
 		auto exc = result.energy.xc();
 		Approx target = Approx(nvxc).epsilon(1.e-10);