From 06327876e83706d8eecba02a3f4d4695a786cff2 Mon Sep 17 00:00:00 2001
From: Marcin Wojdyr <wojdyr@gmail.com>
Date: Sun, 29 Oct 2023 20:06:08 +0100
Subject: [PATCH] use modified expapprox() from jhjourdan/SIMD-math-prims

for calculating sum of gaussians in in ExpSum<N, float>
and ExpAnisoSum<N, float>
---
 docs/hkl.rst               |  6 ++--
 include/gemmi/formfact.hpp | 62 ++++++++++++++++++++++++++++++++++++++
 tests/test_prog.py         |  4 +--
 3 files changed, 67 insertions(+), 5 deletions(-)
diff --git a/docs/hkl.rst b/docs/hkl.rst
index b980266d..af49d5a5 100644
--- a/docs/hkl.rst
+++ b/docs/hkl.rst
@@ -2127,7 +2127,7 @@ into a structure factor grid:
   >>> sf_grid
   <gemmi.ReciprocalComplexGrid(48, 50, 50)>
   >>> sf_grid.get_value(3, 4, 5)
-  (54.536354064941406+53.37517547607422j)
+  (54.53636932373047+53.37513732910156j)
 
 In addition to ``d_min`` and ``rate``, which govern the grid density,
 DensityCalculator has two more parameters that affect accuracy
@@ -2211,7 +2211,7 @@ We either multiply individual values by ``mott_bethe_factor()``
 .. doctest::
 
   >>> dc.mott_bethe_factor([3,4,5]) * grid.get_value(3,4,5)
-  (54.06309366876474+52.971226494748116j)
+  (54.06278054674026+52.97144710344719j)
 
 or we call ``prepare_asu_data()`` with ``mott_bethe=True``:
 
@@ -2220,7 +2220,7 @@ or we call ``prepare_asu_data()`` with ``mott_bethe=True``:
 
   >>> asu_data = grid.prepare_asu_data(dmin=2.5, mott_bethe=True, unblur=dencalc.blur)
   >>> asu_data.value_array[numpy.all(asu_data.miller_array == [3,4,5], axis=1)]
-  array([54.06309+52.971226j], dtype=complex64)
+  array([54.06278+52.971447j], dtype=complex64)
 
 That is all.
 If you would like to separate positions of hydrogen nuclei
diff --git a/include/gemmi/formfact.hpp b/include/gemmi/formfact.hpp
index a19fffcd..3c8374c2 100644
--- a/include/gemmi/formfact.hpp
+++ b/include/gemmi/formfact.hpp
@@ -13,6 +13,24 @@
 
 namespace gemmi {
 
+// NOTE: the argument x must be between -88 and 88.
+// It is based on expapprox() from
+// https://github.com/jhjourdan/SIMD-math-prims/blob/master/simd_math_prims.h
+// Relative error is below 1e-5.
+inline float unsafe_expapprox(float x) {
+  //static float zero = 0.f;  // non-const to disable optimization
+  float val = 12102203.1615614f * x + 1065353216.f;
+  //val = std::max(val, zero);  // check if x < -88.02969
+  int32_t vali = static_cast<std::int32_t>(val);
+  std::int32_t xu1 = vali & 0x7F800000;
+  std::int32_t xu2 = (vali & 0x7FFFFF) | 0x3F800000;
+  float a, b;
+  std::memcpy(&a, &xu1, 4);
+  std::memcpy(&b, &xu2, 4);
+  return a * (0.509871020f + b * (0.312146713f + b * (0.166617139f + b *
+          (-2.190619930e-3f + b * 1.3555747234e-2f))));
+}
+
 // precalculated density of an isotropic atom
 template<int N, typename Real>
 struct ExpSum {
@@ -37,6 +55,34 @@ struct ExpSum {
   }
 };
 
+template<int N>
+struct ExpSum<N, float> {
+  float a[N], b[N];
+  float calculate(float r2) const {
+    float density = 0;
+    float tmp[N];
+    for (int i = 0; i < N; ++i)
+      tmp[i] = std::max(b[i] * r2, -88.f);
+    for (int i = 0; i < N; ++i)
+      density += a[i] * unsafe_expapprox(tmp[i]);
+    return density;
+  }
+
+  std::pair<float,float> calculate_with_derivative(float r) const {
+    float density = 0;
+    float derivative = 0;
+    float tmp[N];
+    for (int i = 0; i < N; ++i)
+      tmp[i] = std::max(b[i] * (r * r), -88.f);
+    for (int i = 0; i < N; ++i) {
+      float y = a[i] * unsafe_expapprox(tmp[i]);
+      density += y;
+      derivative += 2 * b[i] * r * y;
+    }
+    return std::make_pair(density, derivative);
+  }
+};
+
 // precalculated density of an anisotropic atom
 template<int N, typename Real>
 struct ExpAnisoSum {
@@ -51,6 +97,22 @@ struct ExpAnisoSum {
   }
 };
 
+template<int N>
+struct ExpAnisoSum<N, float> {
+  float a[N];
+  SMat33<float> b[N];
+
+  float calculate(const Vec3& r) const {
+    float density = 0;
+    float tmp[N];
+    for (int i = 0; i < N; ++i)
+      tmp[i] = std::max((float)b[i].r_u_r(r), -88.f);
+    for (int i = 0; i < N; ++i)
+      density += a[i] * unsafe_expapprox(tmp[i]);
+    return density;
+  }
+};
+
 template<typename Real>
 constexpr Real pow15(Real x) { return x * std::sqrt(x); }
 
diff --git a/tests/test_prog.py b/tests/test_prog.py
index 105daf7a..cfaf9252 100755
--- a/tests/test_prog.py
+++ b/tests/test_prog.py
@@ -90,7 +90,7 @@ def test_sfcalc_5wkd(self):
         self.do('''\
 $ gemmi sfcalc --blur=12 --dmin=2.5 --rate=2.5 --rcut=1e-7 --test -v tests/5wkd.pdb
 [...]
-RMSE=2.5037e-05  3.544e-05%  max|dF|=0.0001476  R=0.000%  <dPhi>=3.694e-06
+RMSE=5.7890e-05  8.194e-05%  max|dF|=0.0001430  R=0.000%  <dPhi>=1.158e-05
 ''')  # noqa: E501
 
     @unittest.skipIf(sys.platform == 'win32', 'with MSVC it differs slightly')
@@ -98,7 +98,7 @@ def test_sfcalc_1pfe(self):
         self.do('''\
 $ gemmi sfcalc --dmin=9 --rate=4 --blur=60 --rcut=1e-7 --test -v tests/1pfe.cif.gz
 [...]
-RMSE=0.00049055  7.721e-05%  max|dF|=0.002941  R=0.000%  <dPhi>=1.895e-06
+RMSE=0.00050535  7.954e-05%  max|dF|=0.002944  R=0.000%  <dPhi>=2.615e-06
 ''')  # noqa: E501
 
     # example from utils.rst