diff --git a/ciderpress/data/__init__.py b/ciderpress/data/__init__.py
index 32776b3..c78a682 100644
--- a/ciderpress/data/__init__.py
+++ b/ciderpress/data/__init__.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import os
 
 import numpy as np
diff --git a/ciderpress/dft/density_util.py b/ciderpress/dft/density_util.py
index 52017fc..7a7c022 100644
--- a/ciderpress/dft/density_util.py
+++ b/ciderpress/dft/density_util.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import numpy as np
 
 
diff --git a/ciderpress/dft/lcao_convolutions.py b/ciderpress/dft/lcao_convolutions.py
index d823ea7..e43e805 100644
--- a/ciderpress/dft/lcao_convolutions.py
+++ b/ciderpress/dft/lcao_convolutions.py
@@ -102,7 +102,6 @@ def get_etb_from_expnt_range(
     ns = nmaxs - nmins + 1
     etb = []
     for l, n in enumerate(np.ceil(ns).astype(int)):
-        # print(l, n, emin_by_l[l], emax_by_l[l], beta)
         if n > 0 and l <= lmax:
             etb.append((l, n, emin_by_l[l], beta))
     return etb
diff --git a/ciderpress/dft/lcao_interpolation.py b/ciderpress/dft/lcao_interpolation.py
index 2952b74..e28e06a 100644
--- a/ciderpress/dft/lcao_interpolation.py
+++ b/ciderpress/dft/lcao_interpolation.py
@@ -21,6 +21,7 @@
 import ctypes
 
 import numpy as np
+from pyscf import lib as pyscflib
 
 from ciderpress import lib
 from ciderpress.dft.lcao_convolutions import (
@@ -375,6 +376,14 @@ def natm(self):
         return self.atco.natm
 
     def _set_num_ai(self, all_coords):
+        """
+        This function computes the number of coordinates in
+        all_coords that fall in each spline "box" for the radial
+        interpolation grid on each atom. The result is then
+        used to create the _loc_ai member, which in turn
+        is used by the _compute_sline_ind_order function
+        to order grid coordinates by their spline box on a given atom.
+        """
         if all_coords is None:
             assert self.is_num_ai_setup
             return
@@ -424,6 +433,12 @@ def _set_num_ai(self, all_coords):
         self.is_num_ai_setup = True
 
     def _compute_spline_ind_order(self, a):
+        """
+        Assuming _set_num_ai has been called previously, order
+        self.all_coords such that each coordinate is in increasing
+        order of spline index for the radial interpolation grid
+        on atom a.
+        """
         if not self.is_num_ai_setup:
             raise RuntimeError
         ngrids_tot = self.all_coords.shape[0]
@@ -451,6 +466,12 @@ def _compute_spline_ind_order(self, a):
         return self._coords_ord, self._ind_ord_fwd
 
     def _eval_spline_bas_single(self, a):
+        """
+        Note that _set_num_ai must have been called previously, because
+        it is required for _compute_spline_ind_order to work, which
+        is called by this function. TODO might be better to have a cleaner
+        solution for this algorithm.
+        """
         self._compute_spline_ind_order(a)
         ngrids = self._coords_ord.shape[0]
         if self.onsite_direct:
@@ -585,7 +606,7 @@ def _contract_grad_terms(self, excsum, f_g, a, v):
         assert iatom_list is not None
         assert iatom_list.flags.c_contiguous
         ngrids = iatom_list.size
-        libcider.contract_grad_terms2(
+        libcider.contract_grad_terms_parallel(
             excsum.ctypes.data_as(ctypes.c_void_p),
             f_g.ctypes.data_as(ctypes.c_void_p),
             ctypes.c_int(self.atco.natm),
@@ -637,8 +658,7 @@ def _interpolate_nopar_atom_deriv(self, f_arlpq, f_gq):
                     args[3] = auxo_vgp[0].ctypes.data_as(ctypes.c_void_p)
                     fn(*args)
                     self._call_l1_fill(ftmp_gq, self.atom_coords[a], True)
-                    # TODO accelerate since this step will be done many times
-                    ftmp = np.einsum("gq,gq->g", ftmp_gq, f_gq)
+                    ftmp = pyscflib.einsum("gq,gq->g", ftmp_gq, f_gq)
                     self._contract_grad_terms(excsum, ftmp, a, v)
         return excsum
 
@@ -695,7 +715,6 @@ def project_orb2grid(self, f_uq, f_gq=None):
         Args:
             f_uq:
             f_gq:
-            spline_buf: Must be all zeros (TODO this is unsafe)
 
         Returns:
 
@@ -834,7 +853,6 @@ def project_orb2grid(self, f_uq, f_gq=None):
         Args:
             f_uq:
             f_gq:
-            spline_buf: Must be all zeros (TODO this is unsafe)
 
         Returns:
 
diff --git a/ciderpress/dft/model_utils.py b/ciderpress/dft/model_utils.py
index 4d24277..fffe697 100644
--- a/ciderpress/dft/model_utils.py
+++ b/ciderpress/dft/model_utils.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import joblib
 import yaml
 
diff --git a/ciderpress/dft/plans.py b/ciderpress/dft/plans.py
index 72f86f4..8889ba6 100644
--- a/ciderpress/dft/plans.py
+++ b/ciderpress/dft/plans.py
@@ -2055,7 +2055,6 @@ def get_a2q_fast(self, exp_g):
         """A fast (parallel C-backend) version of get_a2q that
         also has a local option to get the q values just
         for the local alphas."""
-        # TODO add bounds checking
         di = np.empty_like(exp_g)
         derivi = np.empty_like(exp_g)
         if self.alpha_formula == "etb":
diff --git a/ciderpress/dft/pwutil.py b/ciderpress/dft/pwutil.py
index 8f9f8cb..6da56a4 100644
--- a/ciderpress/dft/pwutil.py
+++ b/ciderpress/dft/pwutil.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import ctypes
 
 import numpy as np
diff --git a/ciderpress/dft/settings.py b/ciderpress/dft/settings.py
index aedc23f..1be4336 100644
--- a/ciderpress/dft/settings.py
+++ b/ciderpress/dft/settings.py
@@ -72,6 +72,9 @@ def _get_fl_ueg(s):
 def get_cider_exponent(
     rho, sigma, tau, a0=1.0, grad_mul=0.0, tau_mul=0.03125, rhocut=1e-10, nspin=1
 ):
+    """
+    Evaluate an NLDF length-scale exponent at the MGGA level.
+    """
     if nspin > 2:
         raise ValueError
     if isinstance(rho, np.ndarray):
@@ -113,6 +116,9 @@ def get_cider_exponent(
 
 
 def get_cider_exponent_gga(rho, sigma, a0=1.0, grad_mul=0.03125, rhocut=1e-10, nspin=1):
+    """
+    Evaluate an NLDF length-scale exponent at the GGA level.
+    """
     if nspin > 2:
         raise ValueError
     if isinstance(rho, np.ndarray):
@@ -161,6 +167,13 @@ def _get_ueg_expnt(aval, tval, rho):
 
 
 class BaseSettings(ABC):
+    """
+    This is a base class for storing the settings for different
+    types of density/density matrix feature in CiderPress.
+    Settings objects indicate which features must be evaluated,
+    and with which hyperparameters, to use as input to an ML functional.
+    """
+
     @property
     @abstractmethod
     def nfeat(self):
@@ -171,6 +184,9 @@ def nfeat(self):
 
     @property
     def is_empty(self):
+        """
+        Return true of this settings object specifies zero features.
+        """
         return self.nfeat == 0
 
     @abstractmethod
@@ -201,6 +217,14 @@ def get_reasonable_normalizer(self):
 
 
 class EmptySettings(BaseSettings):
+    """
+    The EmptySettings class is a representation of a feature set containing
+    zero features. It is used when a certain type of feature is not
+    present in a model. (For example, if a model does not use SDMX
+    features, that model's FeatureSettings.sdmx_settings will be
+    an EmptySettings instance.)
+    """
+
     @property
     def nfeat(self):
         return 0
@@ -509,8 +533,6 @@ def __init__(self, pows):
                 and features might be poorly defined/numerically inaccurate.
         """
         super(SDMXSettings, self).__init__("smooth")
-        # if mode not in ['diffse', 'r2', 'r4']:
-        #    raise ValueError('Mode must be exact or smooth, got {}'.format(mode))
         self.pows = pows
 
     @property
@@ -948,14 +970,26 @@ def get_reasonable_normalizer(self):
 See ``ALLOWED_J_SPECS``
 """
 
-ALLOWED_RHO_MULTS = ["one", "taumix", "dampmix", "expnt"]
+ALLOWED_RHO_MULTS = ["one", "expnt"]
 """
-TODO docs here
+These strings specify the allowed options for what value
+to multiply the density by before integrating it to construct
+NLDF features. The options are:
+
+one: Identity, i.e. multiply density by 1
+
+expnt: Multiply the density by the NLDF exponent specified
+by the theta_params. (NOTE: Experimental, not thoroughly tested.)
 """
 
-ALLOWED_RHO_DAMPS = ["none", "exponential", "asymptotic_const"]
+ALLOWED_RHO_DAMPS = ["exponential"]
 """
-TODO docs here
+These strings specify the allowed options for how to "damp"
+the density for the version k features. Currently the only allowed
+option is "exponential", which results in the integral
+:math:`\\int g[n](|r-r'|) n(r') exp(-3 a_0[n](r') / 2 a_i[n](r))`,
+where :math:`a_0` is the exponent given by ``theta_params``
+and :math:`a_i` is an exponet given by ``feat_params``.
 """
 
 SPEC_USPS = {
@@ -980,8 +1014,6 @@ def get_reasonable_normalizer(self):
 """
 RHO_MULT_USPS = {
     "one": 0,
-    "taumix": 0,
-    "dampmix": 0,
     "expnt": 2,
 }
 
@@ -1016,7 +1048,7 @@ def __init__(
                 Should be an array of 3 floats [a0, grad_mul, tau_mul].
                 tau_mul is ignored if sl_level="GGA" and may therefore be excluded.
             rho_mult (str): Multiply the density that gets integrated
-                by a prefactor. Options: None/'one', 'taumix', 'dampmix', 'expnt'
+                by a prefactor. Options: See ALLOWED_RHO_MULTS.
         """
         self._sl_level = sl_level
         self.theta_params = theta_params
@@ -1130,7 +1162,7 @@ def __init__(
                 'exponent' is not used in the squared-exponential but rather
                 within the density damping scheme (see rho_damp).
             rho_mult (str): Multiply the density that gets integrated
-                by a prefactor. Options: None/'one', 'taumix', 'dampmix', 'expnt'
+                by a prefactor. Options: See ALLOWED_RHO_MULTS.
             l0_feat_specs (list of str): Each item in the list is a str
                 specifying the formula to be used for the scalar (l=0)
                 features. See ALLOWED_I_SPECS_L0 for allowed values.
@@ -1264,7 +1296,7 @@ def __init__(
                 'exponent' is not used in the squared-exponential but rather
                 within the density damping scheme (see rho_damp).
             rho_mult (str): Multiply the density that gets integrated
-                by a prefactor. Options: None/'one', 'taumix', 'dampmix', 'expnt'
+                by a prefactor. Options: See ALLOWED_RHO_MULTS.
             feat_specs (list of str):
                 Each item in the list is a string specifying the formula
                 to be used for a feature (see ALLOWED_J_SPECS for options).
@@ -1392,7 +1424,7 @@ def __init__(
                 'exponent' is not used in the squared-exponential but rather
                 within the density damping scheme (see rho_damp).
             rho_mult (str): Multiply the density that gets integrated
-                by a prefactor. Options: None/'one', 'taumix', 'dampmix', 'expnt'
+                by a prefactor. Options: See ALLOWED_RHO_MULTS.
             l0_feat_specs_i (list of str): Each item in the list is a str
                 specifying the formula to be used for the scalar (l=0)
                 features. See ALLOWED_I_SPECS_L0 for allowed values.
@@ -1504,7 +1536,7 @@ def __init__(
                 'exponent' is not used in the squared-exponential but rather
                 within the density damping scheme (see rho_damp).
             rho_mult (str): Multiply the density that gets integrated
-                by a prefactor. Options: None/'one', 'taumix', 'dampmix', 'expnt'
+                by a prefactor. Options: See ALLOWED_RHO_MULTS.
             feat_params (list of np.ndarray):
                 Each item in the list is an array with the parameters for the
                 feature corresponding to the feat_specs above. Typically, each array
@@ -1522,6 +1554,8 @@ def __init__(
         for s, p in zip(self.feat_specs, self.feat_params):
             self._check_params(p, spec=s)
         self.rho_damp = rho_damp
+        if self.rho_damp not in ALLOWED_RHO_DAMPS:
+            raise ValueError("rho_damp argument must be in ALLOWED_RHO_DAMPS.")
 
     @property
     def num_feat_param_sets(self):
diff --git a/ciderpress/dft/sph_harm_coeff.py b/ciderpress/dft/sph_harm_coeff.py
index 52a0fa0..71adade 100644
--- a/ciderpress/dft/sph_harm_coeff.py
+++ b/ciderpress/dft/sph_harm_coeff.py
@@ -18,8 +18,6 @@
 # Author: Kyle Bystrom <kylebystrom@gmail.com>
 #
 
-import ctypes
-
 import numpy as np
 from sympy.physics.wigner import clebsch_gordan
 
@@ -38,9 +36,8 @@ def change_basis_real_to_complex(l: int) -> np.ndarray:
     for m in range(1, l + 1):
         q[l + m, l + abs(m)] = (-1) ** m / np.sqrt(2)
         q[l + m, l - abs(m)] = 1j * (-1) ** m / np.sqrt(2)
-    return (
-        -1j
-    ) ** l * q  # Added factor of 1j**l to make the Clebsch-Gordan coefficients real
+    # Added factor of 1j**l to make the Clebsch-Gordan coefficients real
+    return (-1j) ** l * q
 
 
 def su2_clebsch_gordan(l1, l2, l3):
@@ -81,74 +78,6 @@ def clebsch_gordan_e3nn(l1: int, l2: int, l3: int) -> np.ndarray:
     return np.real(C)
 
 
-def get_ylm(r, lmax):
-    N = r.shape[0]
-    nlm = (lmax + 1) * (lmax + 1)
-    rnorm = np.linalg.norm(r, axis=1)
-    rhat = r / rnorm[:, np.newaxis]
-    ylm = np.zeros((N, nlm))
-    libcider.recursive_sph_harm_vec(
-        ctypes.c_int(nlm),
-        ctypes.c_int(N),
-        rhat.ctypes.data_as(ctypes.c_void_p),
-        ylm.ctypes.data_as(ctypes.c_void_p),
-    )
-    xyz = ylm[:, 1:4].copy()
-    ylm[:, 3] = xyz[:, 0]
-    ylm[:, 1] = xyz[:, 1]
-    ylm[:, 2] = xyz[:, 2]
-    for l in range(lmax + 1):
-        for m in range(2 * l + 1):
-            lm = l * l + m
-            ylm[:, lm] *= rnorm**l
-    return ylm
-
-
-def get_ylm_fd(i, r, lmax, delta=1e-6):
-    rx = r.copy()
-    rx[:, i] += 0.5 * delta
-    ylmp = get_ylm(rx, lmax)
-    rx[:, i] -= delta
-    ylmm = get_ylm(rx, lmax)
-    dylm = (ylmp - ylmm) / delta
-    return dylm
-
-
-def get_ylm_ad(r, lmax):
-    nlm = (lmax + 1) * (lmax + 1)
-    rnorm = np.linalg.norm(r, axis=1)
-    rhat = r / rnorm[:, np.newaxis]
-    N = r.shape[0]
-    print("N", N)
-    ylm = np.zeros((N, nlm))
-    dylm_tmp = np.zeros((N, 3, nlm))
-    dylm = np.empty((4, N, nlm))
-    libcider.recursive_sph_harm_deriv_vec(
-        ctypes.c_int(nlm),
-        ctypes.c_int(N),
-        rhat.ctypes.data_as(ctypes.c_void_p),
-        ylm.ctypes.data_as(ctypes.c_void_p),
-        dylm_tmp.ctypes.data_as(ctypes.c_void_p),
-    )
-    dylm[0] = ylm
-    dylm[1:] = dylm_tmp.transpose(1, 0, 2)
-    subval = np.einsum("vgl,gv->gl", dylm[1:], rhat)
-    print("SUBVAL", np.linalg.norm(subval))
-    for l in range(lmax + 1):
-        rlm1 = rnorm ** (l - 1)
-        rl = rnorm**l
-        for m in range(2 * l + 1):
-            lm = l * l + m
-            dylm[0, :, lm] *= rl
-            dylm[1:, :, lm] *= rlm1
-            dylm[1:, :, lm] += ylm[:, lm] * l * rlm1 * rhat.T
-    xyz = dylm[..., 1:4].copy()
-    dylm[..., 3] = xyz[..., 0]
-    dylm[..., 1] = xyz[..., 1]
-    dylm[..., 2] = xyz[..., 2]
-    return dylm
-
-
 def get_deriv_ylm_coeff(lmax):
     nlm = (lmax + 1) * (lmax + 1)
     gaunt_coeff = np.zeros((5, nlm))
@@ -167,190 +96,3 @@ def get_deriv_ylm_coeff(lmax):
                 ((2.0 * l + 3) * l1m * (l + 1 + m) / (2 * l + 1))
             )
     return gaunt_coeff
-
-
-def get_deriv_ylm_coeff_v2(lmax):
-    nlm = (lmax + 1) * (lmax + 1)
-    gaunt_coeff = np.zeros((5, nlm))
-    for l in range(lmax + 1):
-        fe = clebsch_gordan_e3nn(l, 1, l + 1)
-        for im in range(2 * l + 1):
-            lm = l * l + im
-            m = abs(im - l)
-            l1m = l + 1 - m
-            fac = ((2 * l + 3) * (l + 1)) ** 0.5
-            gaunt_coeff[0, lm] = fac * fe[im, 2, im]
-            gaunt_coeff[1, lm] = fac * fe[im, 2, im + 2]
-            gaunt_coeff[2, lm] = fac * fe[im, 0, 2 * l - im]
-            gaunt_coeff[3, lm] = fac * fe[im, 0, 2 * l - im + 2]
-            # gaunt_coeff[:4, lm] *= np.sqrt(2 * m + 1) * (-1)**l
-            gaunt_coeff[4, lm] = np.sqrt(
-                ((2.0 * l + 3) * l1m * (l + 1 + m) / (2 * l + 1))
-            )
-    return gaunt_coeff
-
-
-if __name__ == "__main__":
-    from numpy.testing import assert_allclose, assert_almost_equal
-
-    np.random.seed(32)
-    lmax = 10
-    N = 15
-    nlm = (lmax + 1) * (lmax + 1)
-    r = np.random.normal(size=(N, 3))
-    dylm_ref = np.zeros((3, N, nlm))
-    ylm0 = get_ylm(r, lmax)
-    for i in range(3):
-        dylm_ref[i] = get_ylm_fd(i, r, lmax)
-    dylm_ref2 = get_ylm_ad(r, lmax)
-    for lm in range(0, nlm):
-        assert_allclose(ylm0[:, lm], dylm_ref2[0, :, lm], rtol=1e-6, atol=1e-5)
-        assert_allclose(dylm_ref[0, :, lm], dylm_ref2[1, :, lm], rtol=1e-6, atol=1e-5)
-        assert_allclose(dylm_ref[1, :, lm], dylm_ref2[2, :, lm], rtol=1e-6, atol=1e-5)
-        assert_allclose(dylm_ref[2, :, lm], dylm_ref2[3, :, lm], rtol=1e-6, atol=1e-5)
-
-    import time
-
-    t0 = time.monotonic()
-    gaunt_coeff = get_deriv_ylm_coeff(lmax)
-    t1 = time.monotonic()
-    print("TIME TO GEN COEFF", t1 - t0)
-    print("GAUNT", gaunt_coeff)
-    ylm0_fit = np.zeros((4, N, nlm))
-    igrad = 0
-    for l in range(lmax):
-        mmax = 2 * l + 1
-        for m in range(2 * l + 1):
-            """
-            if igrad == 0:
-                lm = l * l + m
-            else:
-                lm = (l+1) * (l+1) - 1 - m
-            lm0 = lm + 2 * l + 1
-            lm1 = lm + 2 * l + 3
-            ylm0_fit[0, :, lm0] = ylm0[:, lm]
-            ylm0_fit[1, :, lm1] = ylm0[:, lm]
-            """
-            lm = l * l + m
-            if igrad == 0:
-                lmt = lm
-            else:
-                lmt = (l + 1) * (l + 1) - 1 - m
-            lm0 = lmt + 2 * l + 1
-            lm1 = lmt + 2 * l + 3
-            # lm2 = lmh + 2 * l + 1
-            # lm3 = lmh + 2 * l + 3
-            if igrad == 0:
-                ylm0_fit[0, :, lm0] = ylm0[:, lm]
-                ylm0_fit[1, :, lm1] = ylm0[:, lm]
-            else:
-                ylm0_fit[0, :, lm0] = ylm0[:, lm]
-                ylm0_fit[1, :, lm1] = ylm0[:, lm]
-    for l in range(lmax + 1):
-        mmax = 2 * l + 1
-        for m in range(2 * l + 1):
-            lm = l * l + m
-            X = ylm0_fit[:, :, lm].T
-            X = []
-            inds = []
-            mm = m - l
-            am = abs(mm)
-            f1 = (
-                clebsch_gordan(l - 1, 1, l, mm - 1, 1, mm).n(64)
-                * ((2 * l + 1) * l) ** 0.5
-                / (2) ** 0.5
-            )
-            f2 = (
-                clebsch_gordan(l - 1, 1, l, mm + 1, -1, mm).n(64)
-                * ((2 * l + 1) * l) ** 0.5
-                / (2) ** 0.5
-                * -1
-            )
-            if l != 0:
-                fe = clebsch_gordan_e3nn(l - 1, 1, l)
-                print(mm, m, l, fe.shape)
-                # print(fe)
-                fac = ((2 * l + 1) * l) ** 0.5
-                if igrad == 0:
-                    igg = 2
-                    mmm = m
-                    m1m = m - 2
-                    m1p = m
-                elif igrad == 1:
-                    igg = 0
-                    mmm = m
-                    # fac *= -1
-                    m1p = 2 * l - m - 2
-                    m1m = 2 * l - m
-                else:
-                    igg = 1
-                    mmm = m
-                    m1m = m - 1
-                    m1p = m - 1
-                # igg = 2 if igrad == 0 else 0
-                # mmm = m if igrad == 0 else 2 * l - m
-                fxp = fe[m1p, igg, mmm] if mm < l - 1 else 0
-                fxp *= fac
-                fxm = fe[m1m, igg, mmm] if mm > -l + 1 else 0
-                fxm *= fac
-            else:
-                fxp = fxm = 0
-            if mm == 0:
-                f1 *= np.sqrt(2)
-                f2 *= np.sqrt(2)
-            # print('FACTORC', float(f1), float(f2))
-            print("FACTORB", fxp, fxm)
-            if igrad == 1:
-                mvals = [
-                    mm + 1,
-                    mm - 1,
-                ]
-            else:
-                mvals = [
-                    -mm - 1,
-                    -mm + 1,
-                ]
-            # print('MVALS', mvals)
-            for i in range(2):
-                # print('MM', m, mm, mvals[i])
-                if abs(mvals[i]) >= l:
-                    continue
-                if np.linalg.norm(ylm0_fit[i, :, lm]) == 0:
-                    continue
-                for x in X:
-                    if np.linalg.norm(x - ylm0_fit[i, :, lm]) < 1e-6:
-                        break
-                else:
-                    inds.append((i, mvals[i]))
-                    X.append(ylm0_fit[i, :, lm])
-            if len(X) == 0:
-                continue
-            X = np.array(X).T
-            y = dylm_ref[igrad, :, lm]
-            # print(X.shape, y.shape)
-            beta = np.linalg.inv(X.T.dot(X) + 1e-10 * np.identity(X.shape[1]))
-            beta = beta.dot(X.T.dot(y))
-            err = np.linalg.norm(X.dot(beta) - y)
-            assert_almost_equal(err, 0, 4)
-            print(l, m - l, inds, lm, np.round(beta, 6), err)
-            # print(gaunt_coeff[2, lm], err)
-    for lm in range(nlm):
-        c_xl = np.zeros((N, nlm))
-        c_xl[:, lm] = 1
-        dylm_pred = np.zeros((3, N, nlm))
-        libcider.fill_sph_harm_deriv_coeff(
-            c_xl.ctypes.data_as(ctypes.c_void_p),
-            dylm_pred.ctypes.data_as(ctypes.c_void_p),
-            gaunt_coeff.ctypes.data_as(ctypes.c_void_p),
-            ctypes.c_int(N),
-            ctypes.c_int(lmax),
-        )
-        dylm_pred = (dylm_pred * ylm0).sum(-1)
-        print(lm)
-        # print(r[:, 2])
-        # print(ylm0[:, lm])
-        # print(dylm_pred[2])
-        # print(dylm_ref[2, :, lm])
-        assert_almost_equal(dylm_pred[2], dylm_ref2[3, :, lm], 6)
-        assert_almost_equal(dylm_pred[0], dylm_ref2[1, :, lm], 6)
-        assert_almost_equal(dylm_pred[1], dylm_ref2[2, :, lm], 6)
diff --git a/ciderpress/dft/tests/test_baselines.py b/ciderpress/dft/tests/test_baselines.py
index 952e508..70e8918 100644
--- a/ciderpress/dft/tests/test_baselines.py
+++ b/ciderpress/dft/tests/test_baselines.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/dft/tests/test_interpolation.py b/ciderpress/dft/tests/test_interpolation.py
index d4b332f..f26c95b 100644
--- a/ciderpress/dft/tests/test_interpolation.py
+++ b/ciderpress/dft/tests/test_interpolation.py
@@ -211,7 +211,6 @@ def get_e_and_vfeat(f_arlpq):
             df_uq[:, q] = 1e-8 * np.random.normal(size=atco.nao)
             fperturb_pred_arlpq = interpolator.conv2spline(f_uq + df_uq)
             energy_perturb, _ = get_e_and_vfeat(fperturb_pred_arlpq)
-            # print(energy, energy_perturb)
             fd_deriv = energy_perturb - energy
             assert_allclose(np.sum(vf_uq * df_uq), fd_deriv, rtol=2e-4)
         n = 194
diff --git a/ciderpress/dft/tests/test_sph_harm_coeff.py b/ciderpress/dft/tests/test_sph_harm_coeff.py
new file mode 100644
index 0000000..96ab7a7
--- /dev/null
+++ b/ciderpress/dft/tests/test_sph_harm_coeff.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
+"""
+The purpose of these tests is to make sure that all the spherical
+harmonic derivative coefficients are working correctly.
+"""
+
+import ctypes
+import unittest
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_almost_equal
+
+from ciderpress.dft.sph_harm_coeff import (
+    clebsch_gordan,
+    clebsch_gordan_e3nn,
+    get_deriv_ylm_coeff,
+    libcider,
+)
+
+
+def get_ylm(r, lmax):
+    N = r.shape[0]
+    nlm = (lmax + 1) * (lmax + 1)
+    rnorm = np.linalg.norm(r, axis=1)
+    rhat = r / rnorm[:, np.newaxis]
+    ylm = np.zeros((N, nlm))
+    libcider.recursive_sph_harm_vec(
+        ctypes.c_int(nlm),
+        ctypes.c_int(N),
+        rhat.ctypes.data_as(ctypes.c_void_p),
+        ylm.ctypes.data_as(ctypes.c_void_p),
+    )
+    for l in range(lmax + 1):
+        for m in range(2 * l + 1):
+            lm = l * l + m
+            ylm[:, lm] *= rnorm**l
+    return ylm
+
+
+def get_ylm_fd(i, r, lmax, delta=1e-6):
+    rx = r.copy()
+    rx[:, i] += 0.5 * delta
+    ylmp = get_ylm(rx, lmax)
+    rx[:, i] -= delta
+    ylmm = get_ylm(rx, lmax)
+    dylm = (ylmp - ylmm) / delta
+    return dylm
+
+
+def get_ylm_ad(r, lmax):
+    nlm = (lmax + 1) * (lmax + 1)
+    rnorm = np.linalg.norm(r, axis=1)
+    rhat = r / rnorm[:, np.newaxis]
+    N = r.shape[0]
+    ylm = np.zeros((N, nlm))
+    dylm_tmp = np.zeros((N, 3, nlm))
+    dylm = np.empty((4, N, nlm))
+    libcider.recursive_sph_harm_deriv_vec(
+        ctypes.c_int(nlm),
+        ctypes.c_int(N),
+        rhat.ctypes.data_as(ctypes.c_void_p),
+        ylm.ctypes.data_as(ctypes.c_void_p),
+        dylm_tmp.ctypes.data_as(ctypes.c_void_p),
+    )
+    dylm[0] = ylm
+    dylm[1:] = dylm_tmp.transpose(1, 0, 2)
+    for l in range(lmax + 1):
+        rlm1 = rnorm ** (l - 1)
+        rl = rnorm**l
+        for m in range(2 * l + 1):
+            lm = l * l + m
+            dylm[0, :, lm] *= rl
+            dylm[1:, :, lm] *= rlm1
+            dylm[1:, :, lm] += ylm[:, lm] * l * rlm1 * rhat.T
+    return dylm
+
+
+class TestYlm(unittest.TestCase):
+    def test_ylm(self):
+        np.random.seed(32)
+        lmax = 10
+        N = 15
+        nlm = (lmax + 1) * (lmax + 1)
+        r = np.random.normal(size=(N, 3))
+        dylm_ref = np.zeros((3, N, nlm))
+        ylm0 = get_ylm(r, lmax)
+        for i in range(3):
+            dylm_ref[i] = get_ylm_fd(i, r, lmax)
+        dylm_ref2 = get_ylm_ad(r, lmax)
+        for lm in range(0, nlm):
+            assert_allclose(ylm0[:, lm], dylm_ref2[0, :, lm], rtol=1e-6, atol=1e-5)
+            assert_allclose(
+                dylm_ref[0, :, lm], dylm_ref2[1, :, lm], rtol=1e-6, atol=1e-5
+            )
+            assert_allclose(
+                dylm_ref[1, :, lm], dylm_ref2[2, :, lm], rtol=1e-6, atol=1e-5
+            )
+            assert_allclose(
+                dylm_ref[2, :, lm], dylm_ref2[3, :, lm], rtol=1e-6, atol=1e-5
+            )
+
+        gaunt_coeff = get_deriv_ylm_coeff(lmax)
+        ylm0_fit = np.zeros((4, N, nlm))
+        igrad = 0
+        for l in range(lmax):
+            2 * l + 1
+            for m in range(2 * l + 1):
+                lm = l * l + m
+                if igrad == 0:
+                    lmt = lm
+                else:
+                    lmt = (l + 1) * (l + 1) - 1 - m
+                lm0 = lmt + 2 * l + 1
+                lm1 = lmt + 2 * l + 3
+                if igrad == 0:
+                    ylm0_fit[0, :, lm0] = ylm0[:, lm]
+                    ylm0_fit[1, :, lm1] = ylm0[:, lm]
+                else:
+                    ylm0_fit[0, :, lm0] = ylm0[:, lm]
+                    ylm0_fit[1, :, lm1] = ylm0[:, lm]
+        for l in range(lmax + 1):
+            2 * l + 1
+            for m in range(2 * l + 1):
+                lm = l * l + m
+                X = ylm0_fit[:, :, lm].T
+                X = []
+                inds = []
+                mm = m - l
+                abs(mm)
+                f1 = (
+                    clebsch_gordan(l - 1, 1, l, mm - 1, 1, mm).n(64)
+                    * ((2 * l + 1) * l) ** 0.5
+                    / (2) ** 0.5
+                )
+                f2 = (
+                    clebsch_gordan(l - 1, 1, l, mm + 1, -1, mm).n(64)
+                    * ((2 * l + 1) * l) ** 0.5
+                    / (2) ** 0.5
+                    * -1
+                )
+                if l != 0:
+                    fe = clebsch_gordan_e3nn(l - 1, 1, l)
+                    fac = ((2 * l + 1) * l) ** 0.5
+                    if igrad == 0:
+                        igg = 2
+                        mmm = m
+                        m1m = m - 2
+                        m1p = m
+                    elif igrad == 1:
+                        igg = 0
+                        mmm = m
+                        m1p = 2 * l - m - 2
+                        m1m = 2 * l - m
+                    else:
+                        igg = 1
+                        mmm = m
+                        m1m = m - 1
+                        m1p = m - 1
+                    fxp = fe[m1p, igg, mmm] if mm < l - 1 else 0
+                    fxp *= fac
+                    fxm = fe[m1m, igg, mmm] if mm > -l + 1 else 0
+                    fxm *= fac
+                else:
+                    fxp = fxm = 0
+                if mm == 0:
+                    f1 *= np.sqrt(2)
+                    f2 *= np.sqrt(2)
+                if igrad == 1:
+                    mvals = [
+                        mm + 1,
+                        mm - 1,
+                    ]
+                else:
+                    mvals = [
+                        -mm - 1,
+                        -mm + 1,
+                    ]
+                for i in range(2):
+                    if abs(mvals[i]) >= l:
+                        continue
+                    if np.linalg.norm(ylm0_fit[i, :, lm]) == 0:
+                        continue
+                    for x in X:
+                        if np.linalg.norm(x - ylm0_fit[i, :, lm]) < 1e-6:
+                            break
+                    else:
+                        inds.append((i, mvals[i]))
+                        X.append(ylm0_fit[i, :, lm])
+                if len(X) == 0:
+                    continue
+                X = np.array(X).T
+                y = dylm_ref[igrad, :, lm]
+                beta = np.linalg.inv(X.T.dot(X) + 1e-10 * np.identity(X.shape[1]))
+                beta = beta.dot(X.T.dot(y))
+                err = np.linalg.norm(X.dot(beta) - y)
+                assert_almost_equal(err, 0, 4)
+        for lm in range(nlm):
+            c_xl = np.zeros((N, nlm))
+            c_xl[:, lm] = 1
+            dylm_pred = np.zeros((3, N, nlm))
+            libcider.fill_sph_harm_deriv_coeff(
+                c_xl.ctypes.data_as(ctypes.c_void_p),
+                dylm_pred.ctypes.data_as(ctypes.c_void_p),
+                gaunt_coeff.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(N),
+                ctypes.c_int(lmax),
+            )
+            dylm_pred = (dylm_pred * ylm0).sum(-1)
+            assert_almost_equal(dylm_pred[2], dylm_ref2[3, :, lm], 6)
+            assert_almost_equal(dylm_pred[0], dylm_ref2[1, :, lm], 6)
+            assert_almost_equal(dylm_pred[1], dylm_ref2[2, :, lm], 6)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/ciderpress/dft/xc_evaluator.py b/ciderpress/dft/xc_evaluator.py
index 18e6bc1..497877c 100644
--- a/ciderpress/dft/xc_evaluator.py
+++ b/ciderpress/dft/xc_evaluator.py
@@ -325,7 +325,6 @@ def __call__(self, X1, res=None, dres=None):
         if dres is None:
             dres = np.zeros(X1.shape)
         elif dres.shape != X1.shape:
-            print(X1.shape, dres.shape)
             raise ValueError
         n = X1.shape[-2]
         for arr in [res, dres, X1]:
@@ -628,11 +627,6 @@ def __init__(self, mapped_kernels, settings, libxc_baseline=None):
         self.settings = settings
         self.libxc_baseline = libxc_baseline
 
-    def set_baseline_mode(self, mode):
-        # TODO baseline can be GPAW or PySCF mode.
-        # Need to implement for more complicated XC.
-        raise NotImplementedError
-
     def __call__(self, X0T, rhocut=0):
         """
         Evaluate functional from normalized features
diff --git a/ciderpress/dft/xc_evaluator2.py b/ciderpress/dft/xc_evaluator2.py
index dc1cd64..366173d 100644
--- a/ciderpress/dft/xc_evaluator2.py
+++ b/ciderpress/dft/xc_evaluator2.py
@@ -277,11 +277,6 @@ def __init__(self, mapped_kernels, settings, libxc_baseline=None):
         self.settings = settings
         self.libxc_baseline = libxc_baseline
 
-    def set_baseline_mode(self, mode):
-        # TODO baseline can be GPAW or PySCF mode.
-        # Need to implement for more complicated XC.
-        raise NotImplementedError
-
     def __call__(self, X0T, rho_tuple, vrho_tuple=None, rhocut=0):
         """
         Evaluate functional from normalized features
diff --git a/ciderpress/dft/xc_models.py b/ciderpress/dft/xc_models.py
deleted file mode 100644
index 74eeed0..0000000
--- a/ciderpress/dft/xc_models.py
+++ /dev/null
@@ -1,591 +0,0 @@
-#!/usr/bin/env python
-# CiderPress: Machine-learning based density functional theory calculations
-# Copyright (C) 2024 The President and Fellows of Harvard College
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>
-#
-# Author: Kyle Bystrom <kylebystrom@gmail.com>
-#
-
-from abc import ABC, abstractmethod
-
-import numpy as np
-import yaml
-from interpolation.splines.eval_cubic_numba import (
-    vec_eval_cubic_splines_1,
-    vec_eval_cubic_splines_2,
-    vec_eval_cubic_splines_3,
-    vec_eval_cubic_splines_4,
-    vec_eval_cubic_splines_G_1,
-    vec_eval_cubic_splines_G_2,
-    vec_eval_cubic_splines_G_3,
-    vec_eval_cubic_splines_G_4,
-)
-
-
-def get_vec_eval(grid, coeffs, X, N):
-    """
-    Call the numba-accelerated spline evaluation routines from the
-    interpolation package. Also returns derivatives
-    Args:
-        grid: start and end points + number of grids in each dimension
-        coeffs: coefficients of the spline
-        X: coordinates to interpolate
-        N: dimension of the interpolation (between 1 and 4, inclusive)
-    """
-    coeffs = np.expand_dims(coeffs, coeffs.ndim)
-    y = np.zeros((X.shape[0], 1))
-    dy = np.zeros((X.shape[0], N, 1))
-    a_, b_, orders = zip(*grid)
-    if N == 1:
-        vec_eval_cubic_splines_G_1(a_, b_, orders, coeffs, X, y, dy)
-    elif N == 2:
-        vec_eval_cubic_splines_G_2(a_, b_, orders, coeffs, X, y, dy)
-    elif N == 3:
-        vec_eval_cubic_splines_G_3(a_, b_, orders, coeffs, X, y, dy)
-    elif N == 4:
-        vec_eval_cubic_splines_G_4(a_, b_, orders, coeffs, X, y, dy)
-    else:
-        raise ValueError("invalid dimension N")
-    return np.squeeze(y, -1), np.squeeze(dy, -1)
-
-
-def get_cubic(grid, coeffs, X, N):
-    """
-    Call the numba-accelerated spline evaluation routines from the
-    interpolation package.
-    Args:
-        grid: start and end points + number of grids in each dimension
-        coeffs: coefficients of the spline
-        X: coordinates to interpolate
-        N: dimension of the interpolation (between 1 and 4, inclusive)
-    """
-    coeffs = np.expand_dims(coeffs, coeffs.ndim)
-    y = np.zeros((X.shape[0], 1))
-    a_, b_, orders = zip(*grid)
-    if N == 1:
-        vec_eval_cubic_splines_1(a_, b_, orders, coeffs, X, y)
-    elif N == 2:
-        vec_eval_cubic_splines_2(a_, b_, orders, coeffs, X, y)
-    elif N == 3:
-        vec_eval_cubic_splines_3(a_, b_, orders, coeffs, X, y)
-    elif N == 4:
-        vec_eval_cubic_splines_4(a_, b_, orders, coeffs, X, y)
-    else:
-        raise ValueError("invalid dimension N")
-    return np.squeeze(y, -1)
-
-
-class Evaluator:
-    """
-    The Evaluator module computes the exchange energy densities
-    and exchange enhancement factors for spline-interpolated XC functionals.
-    """
-
-    def __init__(
-        self,
-        scale,
-        ind_sets,
-        spline_grids,
-        coeff_sets,
-        xed_y_converter,
-        feature_list,
-        desc_order,
-        const=0,
-        desc_version="c",
-        a0=8.0,
-        fac_mul=0.25,
-        amin=0.05,
-        args=None,
-    ):
-        """
-        Args:
-            scale: lists of weights for the spline functions
-            ind_sets: List of index sets. For each set of indices, passes
-                the features at those indices to the spline interpolation
-                functions.
-            spline_grids: grids on which the splines are evaluated
-            coeff_sets: Coefficients for the spline interpolation
-            xed_y_converter (tuple): set of functions for convert between the
-                exchange energy density (xed) and exchange enhancement
-                factor y
-                    (xed_to_y, y_to_xed, eval_baseline_y,
-                    1 if baseline y is LDA, 2 if GGA)
-            feature_list (FeatureList): FeatureList object
-                (ciderpress.models.transform_data.FeatureList) containing
-                the features to pass to the model.
-            desc_order (list): indexes the descriptors before passing them
-                to the features_list
-            const: constant term to add to the spline
-            desc_version ('c'): Version of the descriptors used ('a' or 'c')
-            a0, fac_mul, amin: parameters passed to CIDER feature generator.
-                (see ciderpress.dft.get_gaussian_grid_a/c)
-        """
-        self.scale = scale
-        self.nterms = len(self.scale)
-        self.ind_sets = ind_sets
-        self.spline_grids = spline_grids
-        self.coeff_sets = coeff_sets
-        self.xed_y_converter = xed_y_converter
-        self.feature_list = feature_list
-        self.nfeat = feature_list.nfeat
-        self.desc_order = desc_order
-        self.const = const
-        self.desc_version = desc_version
-        self.a0 = a0
-        self.fac_mul = fac_mul
-        self.amin = amin
-        self.args = args
-        if self.desc_version in ["b", "d"] and self.args is not None:
-            self.vvmul = args.gg_vvmul
-
-    def _xedy(self, y, x, code):
-        if self.xed_y_converter[-1] == 1:
-            return self.xed_y_converter[code](y, x[:, 0])
-        elif self.xed_y_converter[-1] == 2:
-            return self.xed_y_converter[code](y, x[:, 0], x[:, 1])
-        else:
-            return self.xed_y_converter[code](y)
-
-    def xed_to_y(self, y, x):
-        return self._xedy(y, x, 0)
-
-    def y_to_xed(self, y, x):
-        return self._xedy(y, x, 1)
-
-    def get_descriptors(self, x):
-        return self.feature_list(x[:, self.desc_order])
-
-    def predict_from_desc(self, X, vec_eval=False, max_order=3, min_order=0):
-        res = np.zeros(X.shape[0]) + self.const
-        if vec_eval:
-            dres = np.zeros(X.shape)
-        for t in range(self.nterms):
-            if (not isinstance(self.ind_sets[t], int)) and len(
-                self.ind_sets[t]
-            ) < min_order:
-                continue
-            if isinstance(self.ind_sets[t], int) or len(self.ind_sets[t]) <= max_order:
-                ind_set = self.ind_sets[t]
-                if vec_eval:
-                    y, dy = get_vec_eval(
-                        self.spline_grids[t],
-                        self.coeff_sets[t],
-                        X[:, ind_set],
-                        len(ind_set),
-                    )
-                    res += y * self.scale[t]
-                    dres[:, ind_set] += dy * self.scale[t]
-                else:
-                    res += (
-                        get_cubic(
-                            self.spline_grids[t],
-                            self.coeff_sets[t],
-                            X[:, ind_set],
-                            len(ind_set),
-                        )
-                        * self.scale[t]
-                    )
-        if vec_eval:
-            return res, dres
-        else:
-            return res
-
-    def predict(self, X, vec_eval=False):
-        desc = self.get_descriptors(X)
-        F = self.predict_from_desc(desc, vec_eval=vec_eval)
-        if vec_eval:
-            F = F[0]
-        return self.y_to_xed(F, X)
-
-    def to_dict(self):
-        return {
-            "scale": np.array(self.scale),
-            "ind_sets": self.ind_sets,
-            "spline_grids": self.spline_grids,
-            "coeff_sets": self.coeff_sets,
-            "xed_y_converter": self.xed_y_converter,
-            "feature_list": self.feature_list,
-            "desc_order": self.desc_order,
-            "const": self.const,
-            "desc_version": self.desc_version,
-            "a0": self.a0,
-            "fac_mul": self.fac_mul,
-            "amin": self.amin,
-            "args": self.args,
-        }
-
-    @classmethod
-    def from_dict(cls, state_dict):
-        return cls(
-            state_dict["scale"],
-            state_dict["ind_sets"],
-            state_dict["spline_grids"],
-            state_dict["coeff_sets"],
-            state_dict["xed_y_converter"],
-            state_dict["feature_list"],
-            state_dict["desc_order"],
-            const=state_dict["const"],
-            desc_version=state_dict["desc_version"],
-            a0=state_dict["a0"],
-            fac_mul=state_dict["fac_mul"],
-            amin=state_dict["amin"],
-            args=state_dict["args"],
-        )
-
-    def dump(self, fname):
-        """
-        Save the Evaluator to a file name fname as yaml format.
-        """
-        state_dict = self.to_dict()
-        with open(fname, "w") as f:
-            yaml.dump(state_dict, f)
-
-    @classmethod
-    def load(cls, fname):
-        """
-        Load an instance of this class from yaml
-        """
-        with open(fname, "r") as f:
-            state_dict = yaml.load(f, Loader=yaml.CLoader)
-        return cls.from_dict(state_dict)
-
-
-class MLFunctional(ABC):
-    """
-    Abstract base class for Machine-Learned functionals
-    to be evaluated using ciderpress.dft.NLNumInt
-    """
-
-    def get_F(self, X):
-        """
-        Get exchange enhancement factor.
-        """
-        return self.get_F_and_derivative(X)[0]
-
-    def get_derivative(self, X):
-        """
-        Get derivative of exchange enhancement factor
-        with respect to descriptors.
-        """
-        return self.get_F_and_derivative(X)[1]
-
-    @abstractmethod
-    def get_F_and_derivative(self, X):
-        """
-        Get exchange enhancement factor and its derivative
-        with respect to descriptors.
-        """
-
-
-#######################################################
-# Semi-local functionals and simple helper functions. #
-#######################################################
-
-
-def identity(x):
-    return x.copy()
-
-
-def square(x):
-    return x**2
-
-
-def single(x):
-    return np.ones(x.shape)
-
-
-def extract_kernel_components(kernel):
-    """
-    For an sklearn Kernel object composed of
-    three kernels of the form:
-    kernel = (const * comp) + (noise),
-    extract the three components
-    """
-    return kernel.k1.k1, kernel.k1.k2, kernel.k2
-
-
-class Descriptor:
-    """
-    Old version of the Descriptor class (replaced with
-    the models.transform_data module). Used in Semi-local
-    functional examples below.
-    """
-
-    def __init__(self, code, transform=identity, transform_deriv=single, mul=1.0):
-        self._transform = transform
-        self._transform_deriv = transform_deriv
-        self.code = code
-        self.mul = mul
-
-    def transform_descriptor(self, desc, deriv=0):
-        if deriv == 0:
-            return self._transform(desc[self.code])
-        else:
-            return self._transform(desc[self.code]), self._transform_deriv(
-                desc[self.code]
-            )
-
-
-kappa = 0.804
-mu = 0.2195149727645171
-
-
-class PBEFunctional(MLFunctional):
-    """
-    PBE Exchange functional using the MLFunctional class
-    """
-
-    def __init__(self):
-        self.desc_list = [Descriptor(1)]
-        self.y_to_f_mul = None
-
-    def get_F(self, X):
-        p = X.flatten() ** 2
-        return 1 + kappa - kappa / (1 + mu * p / kappa)
-
-    def get_derivative(self, X):
-        p = X.flatten() ** 2
-        return (mu / (1 + mu * p / kappa) ** 2).reshape(-1, 1)
-
-
-class SCANFunctional(MLFunctional):
-    """
-    SCAN Exchange functional using the MLFunctional class
-    """
-
-    def __init__(self):
-        self.desc_list = [Descriptor(1), Descriptor(2)]
-        self.y_to_f_mul = None
-
-    def get_F(self, X):
-        p = X[:, 0] ** 2
-        s = X[:, 0]
-        alpha = X[:, 1]
-        muak = 10.0 / 81
-        k1 = 0.065
-        b2 = np.sqrt(5913 / 405000)
-        b1 = (511 / 13500) / (2 * b2)
-        b3 = 0.5
-        b4 = muak**2 / k1 - 1606 / 18225 - b1**2
-        h0 = 1.174
-        a1 = 4.9479
-        c1 = 0.667
-        c2 = 0.8
-        dx = 1.24
-        tmp1 = muak * p
-        tmp2 = 1 + b4 * p / muak * np.exp(-np.abs(b4) * p / muak)
-        tmp3 = b1 * p + b2 * (1 - alpha) * np.exp(-b3 * (1 - alpha) ** 2)
-        x = tmp1 * tmp2 + tmp3**2
-        h1 = 1 + k1 - k1 / (1 + x / k1)
-        gx = 1 - np.exp(-a1 / np.sqrt(s + 1e-9))
-        -a1 / 4 * (s + 1e-9) ** (-2.5) * np.exp(-a1 / np.sqrt(s + 1e-9))
-        fx = np.exp(-c1 * alpha / (1 - alpha)) * (alpha < 1) - dx * np.exp(
-            c2 / (1 - alpha)
-        ) * (alpha > 1)
-        fx[np.isnan(fx)] = 0
-        assert not np.isnan(fx).any()
-        Fscan = gx * (h1 + fx * (h0 - h1))
-        return Fscan
-
-    def get_derivative(self, X):
-        p = X[:, 0] ** 2
-        s = X[:, 0]
-        alpha = X[:, 1]
-        muak = 10.0 / 81
-        k1 = 0.065
-        b2 = np.sqrt(5913 / 405000)
-        b1 = (511 / 13500) / (2 * b2)
-        b3 = 0.5
-        b4 = muak**2 / k1 - 1606 / 18225 - b1**2
-        h0 = 1.174
-        a1 = 4.9479
-        c1 = 0.667
-        c2 = 0.8
-        dx = 1.24
-        tmp1 = muak * p
-        tmp2 = 1 + b4 * p / muak * np.exp(-np.abs(b4) * p / muak)
-        tmp3 = b1 * p + b2 * (1 - alpha) * np.exp(-b3 * (1 - alpha) ** 2)
-        x = tmp1 * tmp2 + tmp3**2
-        h1 = 1 + k1 - k1 / (1 + x / k1)
-        gx = 1 - np.exp(-a1 / np.sqrt(s + 1e-9))
-        dgdp = -a1 / 4 * (s + 1e-9) ** (-2.5) * np.exp(-a1 / np.sqrt(s + 1e-9))
-        fx = np.exp(-c1 * alpha / (1 - alpha)) * (alpha < 1) - dx * np.exp(
-            c2 / (1 - alpha)
-        ) * (alpha > 1)
-        fx[np.isnan(fx)] = 0
-        assert not np.isnan(fx).any()
-        gx * (h1 + fx * (h0 - h1))
-        dxdp = (
-            muak * tmp2
-            + tmp1
-            * (
-                b4 / muak * np.exp(-np.abs(b4) * p / muak)
-                - b4 * np.abs(b4) * p / muak**2 * np.exp(-np.abs(b4) * p / muak)
-            )
-            + 2 * tmp3 * b1
-        )
-        dxda = (
-            2
-            * tmp3
-            * (
-                -b2 * np.exp(-b3 * (1 - alpha) ** 2)
-                + 2 * b2 * b3 * (1 - alpha) ** 2 * np.exp(-b3 * (1 - alpha) ** 2)
-            )
-        )
-        dhdx = 1 / (1 + x / k1) ** 2
-        dhdp = dhdx * dxdp
-        dhda = dhdx * dxda
-        dfda = (-c1 * alpha / (1 - alpha) ** 2 - c1 / (1 - alpha)) * np.exp(
-            -c1 * alpha / (1 - alpha)
-        ) * (alpha < 1) - dx * c2 / (1 - alpha) ** 2 * np.exp(c2 / (1 - alpha)) * (
-            alpha > 1
-        )
-        dfda[np.isnan(dfda)] = 0
-
-        dFdp = dgdp * (h1 + fx * (h0 - h1)) + gx * (1 - fx) * dhdp
-        dFda = gx * (dhda - fx * dhda + dfda * (h0 - h1))
-        return np.array([dFdp, dFda]).T
-
-
-#########################
-# GP-based functionals. #
-#########################
-
-
-class GPFunctional(MLFunctional):
-    # TODO: This setup currently assumes that the gp directly
-    # predict F_X - 1. This will not always be the case.
-
-    def __init__(self, gpr):
-        # Assumes kernel_ is (const * rbf) + noise
-        msg = """
-        GPFunctional is provided as a reference only. Its functional
-        derivatives are buggy, so please do not use it for practical
-        calculations.
-
-        gpr is a ciderpress.models.gp.DFTGPR object
-        """
-        import warnings
-
-        warnings.warn(msg)
-        from sklearn.gaussian_process.kernels import RBF
-
-        cov = gpr.gp.kernel_.k1
-        self.alpha_ = cov.k1.constant_value * gpr.gp.alpha_
-        self.kernel = RBF(length_scale=cov.k2.length_scale)
-        self.X_train_ = gpr.gp.X_train_[:, 1:]
-        self.feature_list = gpr.feature_list
-        self.nfeat = self.feature_list.nfeat
-        self.desc_version = gpr.args.version[:1]
-        self.a0 = gpr.args.gg_a0
-        self.amin = gpr.args.gg_amin
-        self.fac_mul = gpr.args.gg_facmul
-        self.vvmul = gpr.args.gg_vvmul
-        self.desc_order = gpr.args.desc_order
-        self.fx_baseline = gpr.xed_y_converter[2]
-        self.fxb_num = gpr.xed_y_converter[3]
-        self.args = gpr.args
-
-    def get_F_and_derivative(self, X):
-        rho = X[0]
-        mat = np.zeros((self.nfeat, X.shape[1]))
-        self.feature_list.fill_vals_(mat, X)
-
-        # X has shape n_test, n_desc
-        # X_train_ has shape n_train, n_desc
-        k = self.kernel(mat.T, self.X_train_)
-        F = k.dot(self.alpha_)
-        k * self.alpha_
-        # shape n_test, n_desc
-        kaxt = np.dot(k * self.alpha_, self.X_train_)
-        np.dot(k, self.alpha_)
-        dF = kaxt - mat.T * F.reshape(-1, 1)
-        dF /= self.kernel.length_scale**2
-        dFddesc = np.zeros(X.shape).T
-        self.feature_list.fill_derivs_(dFddesc.T, dF.T, X)
-
-        highcut = 1e-3
-        lowcut = 1e-6
-        rhocut = np.maximum(rho[rho < highcut], lowcut)
-        xcut = np.log(rhocut / lowcut) / np.log(highcut / lowcut)
-        F[rho < highcut] *= 0.5 * (1 - np.cos(np.pi * xcut))
-        dFddesc[rho < highcut, :] *= 0.5 * (1 - np.cos(np.pi * xcut[:, np.newaxis]))
-        dFddesc[rho < lowcut, :] = 0
-
-        if self.fxb_num == 1:
-            chfx = 1
-        elif self.fxb_num == 2:
-            chfx, dchfx = self.fx_baseline(X[1])
-            dFddesc[:, 1] += dchfx
-        else:
-            raise ValueError("Unsupported basline fx order.")
-        F += chfx
-
-        F[rho < 1e-9] = 0
-        dFddesc[rho < 1e-9, :] = 0
-
-        return F, dFddesc
-
-    def get_F(self, X):
-        return self.get_F_and_derivative(X)[0]
-
-    def get_derivative(self, X):
-        return self.get_F_and_derivative(X)[1]
-
-
-class NormGPFunctional(MLFunctional, Evaluator):
-    """
-    MLFunctional to evaluate spline-interpolated GP
-    functionals using the utilities in the Evaluator class.
-    """
-
-    def __init__(self, *args, **kwargs):
-        """
-        Same arguments as Evaluator.
-        """
-        Evaluator.__init__(self, *args, **kwargs)
-        self.fxb_num = self.xed_y_converter[3]
-        self.fx_baseline = self.xed_y_converter[2]
-
-    def get_F_and_derivative(self, X):
-        rho = X[0]
-        mat = np.zeros((self.nfeat, X.shape[1]))
-        self.feature_list.fill_vals_(mat, X)
-        F, dF = self.predict_from_desc(mat.T, vec_eval=True)
-        dFddesc = np.zeros(X.shape).T
-        self.feature_list.fill_derivs_(dFddesc.T, dF.T, X)
-
-        highcut = 1e-8
-        lowcut = 1e-9
-        rhocut = np.maximum(rho[rho < highcut], lowcut)
-        xcut = np.log(rhocut / lowcut) / np.log(highcut / lowcut)
-        F[rho < highcut] *= 0.5 * (1 - np.cos(np.pi * xcut))
-        dFddesc[rho < highcut, :] *= 0.5 * (1 - np.cos(np.pi * xcut[:, np.newaxis]))
-        dFddesc[rho < lowcut, :] = 0
-
-        if self.fxb_num == 1:
-            chfx = 1
-        elif self.fxb_num == 2:
-            chfx, dchfx = self.fx_baseline(X[1])
-            dFddesc[:, 1] += dchfx
-        else:
-            raise ValueError("Unsupported baseline fx order.")
-        F += chfx
-
-        if rho is not None:
-            F[rho < 1e-9] = 0
-            dFddesc[rho < 1e-9, :] = 0
-
-        return F, dFddesc
diff --git a/ciderpress/dft/xc_torch.py b/ciderpress/dft/xc_torch.py
index 86ccafe..c7fec88 100644
--- a/ciderpress/dft/xc_torch.py
+++ b/ciderpress/dft/xc_torch.py
@@ -105,7 +105,6 @@ def eval_gp(Xtst):
 
     my_nn = nncls(X.shape[1], nlayer=nlayer, nhidden=nhidden)
     my_nn = my_nn.double().to(device)
-    print(my_nn)
 
     loss = nn.MSELoss()
     # optimizer = torch.optim.SGD(my_nn.parameters(), lr=1e-2)
@@ -184,7 +183,6 @@ def closure_in():
         Xlst.append(Xp)
         Xlst.append(Xm)
         dy = (eval_gp(Xp) - eval_gp(Xm)) / delta
-        print(dy.shape)
         ylst.append(dy)
     X = np.concatenate(Xlst, axis=0)
     y = np.concatenate(ylst)
@@ -207,16 +205,10 @@ def closure_in():
         else:
             train(X, y, my_nn, loss, optimizer, t, need_closure=True)
     my_nn.eval()
-    import time
-
-    t0 = time.monotonic()
     for i in range(100):
         pred = my_nn(X[:nsamp])
-    t1 = time.monotonic()
-    print(t1 - t0)
     diff = (y[:nsamp] - pred).cpu().detach().numpy()
-    print(np.mean(np.abs(diff)))
-    print(np.sqrt(np.mean(diff**2)))
-    print(np.max(np.abs(diff)))
-    print("Done")
+    print(
+        "Err", np.mean(np.abs(diff)), np.sqrt(np.mean(diff**2)), np.max(np.abs(diff))
+    )
     return my_nn.to(torch.device("cpu"))
diff --git a/ciderpress/gpaw/atom_descriptor_utils.py b/ciderpress/gpaw/atom_descriptor_utils.py
index afc0c58..5fe429e 100644
--- a/ciderpress/gpaw/atom_descriptor_utils.py
+++ b/ciderpress/gpaw/atom_descriptor_utils.py
@@ -327,9 +327,9 @@ def initialize_more_things(self, setups=None):
                     d=0.02,
                 )
                 encut = setup.Z**2 * 2000
-                if encut - 1e-7 <= np.max(self.bas_exp_fit):
-                    encut0 = np.max(self.bas_exp_fit)
-                    Nalpha = self.bas_exp_fit.size
+                if encut - 1e-7 <= np.max(self.alphas):
+                    encut0 = np.max(self.alphas)
+                    Nalpha = self.alphas.size
                 else:
                     Nalpha = (
                         int(np.ceil(np.log(encut / self._amin) / np.log(self.lambd)))
@@ -365,7 +365,7 @@ def initialize_more_things(self, setups=None):
                 setup.ps_setup = pss_cls.from_setup_and_atco(
                     setup,
                     setup.cider_contribs.atco_inp,
-                    self.bas_exp_fit,
+                    self.alphas,
                     self.plan.alpha_norms,
                     encut0,
                     grid_nlm=setup.cider_contribs.nlm,
@@ -374,7 +374,7 @@ def initialize_more_things(self, setups=None):
 
     def calculate_paw_cider_features_p1(self, setups, D_asp, DD_aop, p_o):
         if len(D_asp.keys()) == 0:
-            return {}, {}
+            return {}
         a0 = list(D_asp.keys())[0]
         nspin = D_asp[a0].shape[0]
         assert (nspin == 1) or self.is_cider_functional
@@ -402,7 +402,7 @@ def calculate_paw_cider_features_p1(self, setups, D_asp, DD_aop, p_o):
             self.df_asgLq[a] = df_sgLq
             self.fr_asgLq[a] = fr_sgLq
             self.c_asiq[a] = c_siq
-        return self.c_asiq, self.df_asgLq
+        return self.c_asiq
 
     def calculate_paw_cider_features_p2(self, setups, D_asp, D_aoiq, DD_aop, p_o):
         if len(D_asp.keys()) == 0:
diff --git a/ciderpress/gpaw/atom_utils.py b/ciderpress/gpaw/atom_utils.py
index d1f9461..24407c3 100644
--- a/ciderpress/gpaw/atom_utils.py
+++ b/ciderpress/gpaw/atom_utils.py
@@ -23,6 +23,7 @@
 import numpy as np
 from gpaw.atom.radialgd import EquidistantRadialGridDescriptor
 from gpaw.sphere.lebedev import R_nv, Y_nL, weight_n
+from gpaw.utilities.timing import Timer
 from gpaw.xc.pawcorrection import rnablaY_nLv
 from gpaw.xc.vdw import spline
 from scipy.linalg import cho_factor, cho_solve
@@ -76,7 +77,6 @@ def get_ag_indices(fft_obj, gd, shape, spos_c, rmax, buffer=0, get_global_disps=
     vol = np.abs(np.linalg.det(lattice))
     for i in range(3):
         res = np.cross(lattice[(i + 1) % 3], lattice[(i + 2) % 3])
-        # TODO unnecessarily conservative buffer?
         disp[i] = np.ceil(np.linalg.norm(res) * rmax / vol * shape[i]) + 1 + buffer
     indices = [
         np.arange(center[i] - disp[i], center[i] + disp[i] + 1) for i in range(3)
@@ -144,7 +144,6 @@ def __init__(
 
     @classmethod
     def from_setup(cls, setup, rmin=1e-4, d=0.03, encut=1e6, N=None):
-        # rcut = np.max(setup.rcut_j)
         rmax = setup.rgd.r_g[-1]
         if N is not None:
             assert isinstance(N, int)
@@ -211,6 +210,9 @@ def from_gd_and_setup(
         if ovlp_fit and not is_global:
             raise ValueError("Need global grid set for overlap fitting")
         rgd = psetup.interp_rgd
+        # Make sure the grid is linear, as this is assumed
+        # by the calculation of dg below and by the get_grads function.
+        assert isinstance(rgd, EquidistantRadialGridDescriptor)
         if rmax == 0:
             rmax = psetup.rcut
         shape = gd.get_size_of_global_array()
@@ -228,7 +230,7 @@ def from_gd_and_setup(
             rhat_gv = rhat_gv[cond]
 
         h = rgd.r_g[1] - rgd.r_g[0]
-        dg = rad_g / h  # TODO only for equidist grid
+        dg = rad_g / h
         g = np.floor(dg).astype(np.int32)
         g = np.minimum(g, rgd.r_g.size - 1)
         dg -= g
@@ -300,7 +302,6 @@ def get_grads(self, pfunc=True):
         ylm, dylm = pwutil.recursive_sph_harm_deriv(Lmax, self.rhat_gv)
         dylm /= self.rad_g[:, None, None] + 1e-8  # TODO right amount of regularization?
         rodylm = np.ascontiguousarray(np.einsum("gv,gvL->Lg", self.rhat_gv, dylm))
-        # dylm = np.dot(dylm, drhat_g.T)
         funcs_ig = pwutil.eval_pasdw_funcs(
             radderivs_ng,
             np.ascontiguousarray(ylm.T),
@@ -409,21 +410,15 @@ def __init__(
     ):
         self.plan = plan
         self.cider_kernel = cider_kernel
-        # NOTE k-space convolution should be default in production
-        # version for now, since ccl is high-memory (needs on-the-fly
-        # integral generation) and since reciprocal-space convolutions
-        # generate the fitting procedure in the PSmoothSetup
         self._atco = atco
         self.w_g = (xcc.rgd.dv_g[:, None] * weight_n).ravel()
         self.r_g = xcc.rgd.r_g
         self.xcc = xcc
-        # TODO might want more contrib over grids_indexer nlm,
+        # TODO might want more control over grids_indexer nlm,
         # currently it is just controlled by size of Y_nL in GPAW.
         self.grids_indexer = AtomicGridsIndexer.make_single_atom_indexer(Y_nL, self.r_g)
         self.grids_indexer.set_weights(self.w_g)
         self._paw_algo = paw_algo
-        from gpaw.utilities.timing import Timer
-
         self.timer = Timer()
 
     @property
@@ -1254,15 +1249,16 @@ class FastPASDWCiderKernel:
 
     fr_asgLq: dict
     df_asgLq: dict
-    # TODO below vfr and vdf not needed, can reuse fr and df to save memory
     vfr_asgLq: dict
     vdf_asgLq: dict
-    rho_asxg: dict
-    vrho_asxg: dict
-    rhot_asxg: dict
-    vrhot_asxg: dict
-    # TODO This can be reused for D_asiq and vc_asiq
-    c_asiq: dict
+    # TODO if calculating the density (which done 3X per loop in
+    # this implementation) and potential (done 2X per loop)
+    # ever becomes a bottleneck, it can
+    # be saved in the below variables to avoid recomputing.
+    # rho_asxg: dict
+    # vrho_asxg: dict
+    # rhot_asxg: dict
+    # vrhot_asxg: dict
 
     def __init__(
         self,
@@ -1278,16 +1274,11 @@ def __init__(
 
             self.timer = Timer()
         self.gd = gd
-        # TODO refactor this, "bas_exp_fit" not used anywhere else.
-        # Also "alphas" as indices aren't used anywhere in the new
-        # version, so there's no confusion just calling it alphas
-        # anymore
-        self.bas_exp_fit = plan.alphas
         self.cider_kernel = cider_kernel
         self.Nalpha_small = plan.nalpha
         self.cut_xcgrid = cut_xcgrid
-        self._amin = np.min(self.bas_exp_fit)
         self.plan = plan
+        self._amin = np.min(self.alphas)
         self.is_mgga = plan.nldf_settings.sl_level == "MGGA"
         self._paw_algo = paw_algo
         if self._paw_algo not in ["v1", "v2"]:
@@ -1297,6 +1288,10 @@ def __init__(
     def lambd(self):
         return self.plan.lambd
 
+    @property
+    def alphas(self):
+        return self.plan.alphas
+
     def get_D_asp(self):
         return self.atomdist.to_work(self.dens.D_asp)
 
@@ -1315,7 +1310,7 @@ def initialize_more_things(self, setups=None):
                 setup.xc_correction = DiffPAWXCCorrection.from_setup(
                     setup, build_kinetic=self.is_mgga, ke_order_ng=False
                 )
-                # TODO it would be good to remove this
+                # TODO it would be good to be able to remove this.
                 # Old version of the code used these settings:
                 # setup, rmin=setup.xc_correction.rgd.r_g[0]+1e-5,
                 # N=1024, encut=1e6, d=0.018
@@ -1336,9 +1331,9 @@ def initialize_more_things(self, setups=None):
                 # value and pass raise_large_expnt_error=False
                 # to the initializer below just in case to avoid crashes.
                 encut = setup.Z**2 * 2000
-                if encut - 1e-7 <= np.max(self.bas_exp_fit):
-                    encut0 = np.max(self.bas_exp_fit)
-                    Nalpha = self.bas_exp_fit.size
+                if encut - 1e-7 <= np.max(self.alphas):
+                    encut0 = np.max(self.alphas)
+                    Nalpha = self.alphas.size
                 else:
                     Nalpha = (
                         int(np.ceil(np.log(encut / self._amin) / np.log(self.lambd)))
@@ -1374,7 +1369,7 @@ def initialize_more_things(self, setups=None):
                 setup.ps_setup = pss_cls.from_setup_and_atco(
                     setup,
                     setup.cider_contribs.atco_inp,
-                    self.bas_exp_fit,
+                    self.alphas,
                     self.plan.alpha_norms,
                     encut0,
                     grid_nlm=setup.cider_contribs.nlm,
@@ -1394,13 +1389,13 @@ def calculate_paw_cider_features(self, setups, D_asp):
             D_asp: Atomic PAW density matrices
         """
         if len(D_asp.keys()) == 0:
-            return {}, {}
+            return {}
         a0 = list(D_asp.keys())[0]
         nspin = D_asp[a0].shape[0]
         assert (nspin == 1) or self.is_cider_functional
         self.fr_asgLq = {}
         self.df_asgLq = {}
-        self.c_asiq = {}
+        c_asiq = {}
 
         for a, D_sp in D_asp.items():
             setup = setups[a]
@@ -1431,8 +1426,8 @@ def calculate_paw_cider_features(self, setups, D_asp):
             self.timer.stop()
             self.df_asgLq[a] = df_sgLq
             self.fr_asgLq[a] = fr_sgLq
-            self.c_asiq[a] = c_siq
-        return self.c_asiq, self.df_asgLq
+            c_asiq[a] = c_siq
+        return c_asiq
 
     def calculate_paw_cider_energy(self, setups, D_asp, D_asiq):
         """
@@ -1513,6 +1508,10 @@ def calculate_paw_cider_energy(self, setups, D_asp, D_asiq):
                 ) * dv_g[:, None]
             self.vfr_asgLq[a] = vfr_sgLq
             self.vdf_asgLq[a] = vf_sgLq
+            # This step saves some memory since fr_asgLq and df_asgLq are not
+            # needed anymore after this step.
+            self.fr_asgLq[a] = None
+            self.df_asgLq[a] = None
         return dvD_asiq, deltaE, deltaV
 
     def calculate_paw_cider_potential(self, setups, D_asp, vc_asiq):
@@ -1568,7 +1567,6 @@ def calculate_paw_feat_corrections(
         setups,
         D_asp,
         D_asiq=None,
-        df_asgLq=None,
         vc_asiq=None,
     ):
         """
@@ -1595,14 +1593,12 @@ def calculate_paw_feat_corrections(
         Args:
             setups: GPAW atomic setups
             D_asp: Atomic PAW density matrices
-            D_sabi: Atomic PAW PASDW projections
-            df_asbLg: delta-F_beta convolutions
-            vc_sabi: functional derivatives with respect to coefficients c_sabi
+            D_asiq: Atomic PAW PASDW projections
+            vc_asiq: functional derivatives with respect to coefficients c_sabi
         """
         if D_asiq is None and vc_asiq is None:
             return self.calculate_paw_cider_features(setups, D_asp)
         elif D_asiq is not None:
-            assert df_asgLq is not None
             return self.calculate_paw_cider_energy(setups, D_asp, D_asiq)
         else:
             assert vc_asiq is not None
@@ -1733,9 +1729,8 @@ def get_fcut(self, r):
     @classmethod
     def from_setup(cls, setup):
         rgd = setup.xc_correction.rgd
-        RCUT_FAC = 1.0  # TODO was 1.3
+        RCUT_FAC = 1.0
         rcut_feat = np.max(setup.rcut_j) * RCUT_FAC
-        # rcut_func = rcut_feat * 2.0
         rmax = np.max(setup.xc_correction.rgd.r_g)
         rcut_func = rmax * 0.8
 
@@ -2013,7 +2008,6 @@ def basis2grid_spin(self, p_suq, p_srlmq=None):
         nspin = p_suq.shape[0]
         nalpha = p_suq.shape[2]
         assert p_suq.shape[1] == self.nu
-        # TODO don't hard code lmax
         ngrid = self.funcs_ig.shape[1]
         if p_srlmq is None:
             p_srlmq = np.zeros((nspin, ngrid, self.nlm, nalpha))
@@ -2027,7 +2021,7 @@ def grid2basis_spin(self, p_srlmq, p_suq=None):
         nspin = p_srlmq.shape[0]
         nalpha = p_srlmq.shape[3]
         assert p_srlmq.shape[1] == self.funcs_ig.shape[1]
-        # TODO check lmax
+        assert p_srlmq.shape[2] == self.nlm
         if p_suq is None:
             p_suq = np.zeros((nspin, self.nu, nalpha))
         for s in range(nspin):
diff --git a/ciderpress/gpaw/calculator.py b/ciderpress/gpaw/calculator.py
index 8178996..efef283 100644
--- a/ciderpress/gpaw/calculator.py
+++ b/ciderpress/gpaw/calculator.py
@@ -132,12 +132,22 @@ class is returned to evaluate the semilocal functional.
     """
 
     mlfunc = load_cider_model(mlfunc, mlfunc_format)
-    if mlfunc.settings.sl_settings.level == "MGGA":
-        mlfunc.desc_version = "b"
-    else:
-        mlfunc.desc_version = "d"
+    sl_level = mlfunc.settings.sl_settings.level
+    if not mlfunc.settings.nlof_settings.is_empty:
+        raise NotImplementedError("Nonlocal orbital features in GPAW")
+    elif not mlfunc.settings.sdmx_settings.is_empty:
+        raise NotImplementedError("SDMX features in GPAW")
+    elif not mlfunc.settings.nldf_settings.is_empty:
+        if mlfunc.settings.nldf_settings.version != "j":
+            raise NotImplementedError(
+                "Currently only version j NLDF features are implemented in GPAW. "
+                "Other versions are planned for future development. The version "
+                "of this functional's NLDF features is: {}".format(
+                    mlfunc.settings.nldf_settings.version
+                )
+            )
 
-    if mlfunc.desc_version == "b":
+    if sl_level == "MGGA":
         no_nldf = mlfunc.settings.nldf_settings.is_empty
         if no_nldf and not _force_nonlocal:
             # functional is semi-local MGGA
@@ -148,7 +158,7 @@ class is returned to evaluate the semilocal functional.
             cls = CiderMGGAPASDW
         else:
             cls = CiderMGGA
-    elif mlfunc.desc_version == "d":
+    else:
         no_nldf = mlfunc.settings.nldf_settings.is_empty
         if no_nldf and not _force_nonlocal:
             # functional is semi-local GGA
@@ -159,9 +169,6 @@ class is returned to evaluate the semilocal functional.
             cls = CiderGGAPASDW
         else:
             cls = CiderGGA
-    else:
-        msg = "Only implemented for b and d version, found {}"
-        raise ValueError(msg.format(mlfunc.desc_version))
 
     if use_paw:
         xc = cls(
diff --git a/ciderpress/gpaw/cider_fft.py b/ciderpress/gpaw/cider_fft.py
index db40ede..35b3a66 100644
--- a/ciderpress/gpaw/cider_fft.py
+++ b/ciderpress/gpaw/cider_fft.py
@@ -483,9 +483,8 @@ def __init__(self, cider_kernel, **kwargs):
 
         GGA.__init__(self, LibXC("PBE"), stencil=2)
         self.type = "GGA"
-        self.name = "CIDER_{}".format(
-            self.cider_kernel.xmix
-        )  # TODO more informative name
+        # TODO more informative name for the functional
+        self.name = "CIDER_{}".format(self.cider_kernel.xmix)
 
     def todict(self):
         d = super(CiderGGA, self).todict()
diff --git a/ciderpress/gpaw/cider_kernel.py b/ciderpress/gpaw/cider_kernel.py
index 242face..9f5fa20 100644
--- a/ciderpress/gpaw/cider_kernel.py
+++ b/ciderpress/gpaw/cider_kernel.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import numpy as np
 from gpaw.xc.kernel import XCKernel
 from gpaw.xc.libxc import LibXC
diff --git a/ciderpress/gpaw/cider_paw.py b/ciderpress/gpaw/cider_paw.py
index 4d0e247..e03217c 100644
--- a/ciderpress/gpaw/cider_paw.py
+++ b/ciderpress/gpaw/cider_paw.py
@@ -396,9 +396,7 @@ def _set_paw_terms(self):
         D_asp = self.get_D_asp()
         if not (D_asp.partition.rank_a == self.atom_partition.rank_a).all():
             raise ValueError("rank_a mismatch")
-        self.c_asiq, self.df_asgLq = self.paw_kernel.calculate_paw_feat_corrections(
-            self.setups, D_asp
-        )
+        self.c_asiq = self.paw_kernel.calculate_paw_feat_corrections(self.setups, D_asp)
         self._save_c_asiq = {k: v.copy() for k, v in self.c_asiq.items()}
         self.D_asp = D_asp
         self.timer.stop()
@@ -408,7 +406,7 @@ def _calculate_paw_energy_and_potential(self, grad_mode=CIDERPW_GRAD_MODE_NONE):
         if grad_mode == CIDERPW_GRAD_MODE_STRESS:
             ctmp_asiq = {a: c_siq.copy() for a, c_siq in self.c_asiq.items()}
         self.c_asiq, deltaE, deltaV = self.paw_kernel.calculate_paw_feat_corrections(
-            self.setups, self.D_asp, D_asiq=self.c_asiq, df_asgLq=self.df_asgLq
+            self.setups, self.D_asp, D_asiq=self.c_asiq
         )
         self.E_a_tmp = deltaE
         self.deltaV = deltaV
diff --git a/ciderpress/gpaw/cider_sl.py b/ciderpress/gpaw/cider_sl.py
index fa52ab9..ab20f8c 100644
--- a/ciderpress/gpaw/cider_sl.py
+++ b/ciderpress/gpaw/cider_sl.py
@@ -1,4 +1,23 @@
-import joblib
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import numpy as np
 import yaml
 
@@ -59,28 +78,6 @@ def todict(self):
     def get_mlfunc_data(self):
         return yaml.dump(self.kernel.mlfunc, Dumper=yaml.CDumper)
 
-    @classmethod
-    def from_joblib(cls, fname, **kwargs):
-        mlfunc = joblib.load(fname)
-        return cls.from_mlfunc(mlfunc, **kwargs)
-
-    @staticmethod
-    def from_mlfunc(mlfunc, xmix=1.00, xkernel="GGA_X_PBE", ckernel="GGA_C_PBE"):
-        if mlfunc.desc_version == "b":
-            cider_kernel = SLCiderMGGAHybridWrapper(mlfunc, xmix, xkernel, ckernel)
-            cls = SLCiderMGGA
-        elif mlfunc.desc_version == "d":
-            cider_kernel = SLCiderGGAHybridWrapper(mlfunc, xmix, xkernel, ckernel)
-            cls = SLCiderGGA
-        else:
-            raise ValueError(
-                "Only implemented for b and d version, found {}".format(
-                    mlfunc.desc_version
-                )
-            )
-
-        return cls(cider_kernel)
-
 
 class SLCiderGGA(_SLCiderBase, DiffGGA):
     def todict(self):
diff --git a/ciderpress/gpaw/descriptors.py b/ciderpress/gpaw/descriptors.py
index fd07aa8..2e65d1a 100644
--- a/ciderpress/gpaw/descriptors.py
+++ b/ciderpress/gpaw/descriptors.py
@@ -83,6 +83,9 @@ def get_descriptors(
         use_paw (bool, True): Whether to use PAW corrections.
         screen_dens (bool, True): Whether to remove low-density
             grids from the return feature vectors.
+        kwargs: Any additional arguments to be passed to the
+            Cider functional object (like qmax, lambd, etc.)
+            to compute the functional.
 
     Returns:
         if p_i is None:
@@ -797,7 +800,7 @@ def _set_paw_terms(self):
             D_asp = self.get_D_asp()
             if not (D_asp.partition.rank_a == self.atom_partition.rank_a).all():
                 raise ValueError("rank_a mismatch")
-            self.c_asiq, self.df_asgLq = self.paw_kernel.calculate_paw_feat_corrections(
+            self.c_asiq = self.paw_kernel.calculate_paw_feat_corrections(
                 self.setups, D_asp
             )
             self.D_asp = D_asp
@@ -805,10 +808,9 @@ def _set_paw_terms(self):
             D_asp = self.get_D_asp()
             if not (D_asp.partition.rank_a == self.atom_partition.rank_a).all():
                 raise ValueError("rank_a mismatch")
-            res = self.paw_kernel.calculate_paw_cider_features_p1(
+            self.c_asiq = self.paw_kernel.calculate_paw_cider_features_p1(
                 self.setups, D_asp, self._DD_aop, self._p_o
             )
-            self.c_asiq, self.df_asgLq = res
             self.D_asp = D_asp
 
 
diff --git a/ciderpress/gpaw/fit_paw_gauss_pot.py b/ciderpress/gpaw/fit_paw_gauss_pot.py
index 5a5b73d..5727dbb 100644
--- a/ciderpress/gpaw/fit_paw_gauss_pot.py
+++ b/ciderpress/gpaw/fit_paw_gauss_pot.py
@@ -208,7 +208,7 @@ def get_p22_matrix(phi_iabg, rgd, rcut, l, w_b, nbas_loc, reg=0, cut_func=False)
     nj, na = p22_jaja.shape[:2]
     p22_ii = p22_jaja.view()
     p22_ii.shape = (nj * na, nj * na)
-    # TODO this is closed to previous regularization, but not
+    # TODO this is close to previous regularization, but not
     # necessarily the optimal regularization
     reg = reg / np.tile(w_b, nj)
     p22_ii[:] += reg * np.identity(nj * na)
diff --git a/ciderpress/gpaw/interp_paw.py b/ciderpress/gpaw/interp_paw.py
index d859526..9738597 100644
--- a/ciderpress/gpaw/interp_paw.py
+++ b/ciderpress/gpaw/interp_paw.py
@@ -296,8 +296,6 @@ def __init__(
         self.taut_npg = taut_npg
         self.tauc_g = tauc_g
         self.tauct_g = tauct_g
-        # print('SETUP MINS AE', np.min(self.tauc_g - self.dc_g**2 / (8 * self.nc_g + 1e-16)))
-        # print('SETUP MINS PS', np.min(self.tauct_g - self.dct_g**2 / (8 * self.nct_g + 1e-16)))
         if self.tau_npg is not None:
             NP = self.tau_npg.shape[1]
             self.tau_pg = np.ascontiguousarray(
@@ -576,56 +574,6 @@ def __call__(self, rgd, D_sLq, n_qg, dndr_qg, nc0_sg, dnc0dr_sg, ae=True):
 
             return E, dEdD_sqL
 
-        elif self.rcalc.mode == "potential2":
-
-            """
-            WARNING TODO: DRAFT CODE
-            """
-            dedn_sg, b_vsg, a_sg, dedsigma_xg = self.rcalc(
-                rgd,
-                n_sLg,
-                Y_nL[:, :Lmax],
-                dndr_sLg,
-                rnablaY_nLv[:, :Lmax],
-                self.F_sbg,
-                self.y_sbg,
-                ae=ae,
-            )
-
-            nn = Y_nL.shape[0]
-
-            dedn_sng = dedn_sg.reshape(nspins, nn, -1)
-            dedsigma_xng = dedsigma_xg.reshape(2 * nspins - 1, nn, -1)
-            b_vsng = b_vsg.reshape(3, nspins, nn, -1)
-            a_sng = a_sg.reshape(nspins, nn, -1)
-
-            dEdD_snq = np.dot(dedn_sng, n_qg.T)
-            dEdD_snq += 2 * np.dot(dedsigma_xng[::nspins] * a_sng, dndr_qg.T)
-            if nspins == 2:
-                dEdD_snq += np.dot(dedsigma_xng[1] * a_sng[::-1], dndr_qg.T)
-
-            dEdD_sqL = np.einsum(
-                "snq,nL->sqL", dEdD_snq, weight_n[:, None] * Y_nL[:, :Lmax]
-            )
-
-            # dedsigma_xng *= rgd.dr_g
-            dedsigma_xng *= 1.0 / (4 * np.pi * rgd.r_g * rgd.r_g)
-            B_vsng = dedsigma_xng[::2] * b_vsng
-            if nspins == 2:
-                B_vsng += 0.5 * dedsigma_xng[1] * b_vsng[:, ::-1]
-            B_vsnq = np.dot(B_vsng, n_qg.T)
-            dEdD_sqL += (
-                8
-                * np.pi
-                * np.einsum(
-                    "nLv,vsnq->sqL",
-                    weight_n[:, None, None] * rnablaY_nLv[:, :Lmax],
-                    B_vsnq,
-                )
-            )
-
-            return dEdD_sqL
-
         elif self.rcalc.mode == "energy_v2":
             e_g, dedn_sg, dedgrad_svg, v_sbg = self.rcalc(
                 rgd,
diff --git a/ciderpress/gpaw/nldf_interface.py b/ciderpress/gpaw/nldf_interface.py
index 2fc97d5..c4b9f82 100644
--- a/ciderpress/gpaw/nldf_interface.py
+++ b/ciderpress/gpaw/nldf_interface.py
@@ -540,43 +540,3 @@ def wrap_fft_mpi(pd):
     fft_obj.initialize_backend()
     wrapper = FFTWrapper(fft_obj, distribution, pd)
     return wrapper
-
-
-if __name__ == "__main__":
-    from gpaw.mpi import MPIComm, world
-
-    np.random.seed(98)
-
-    comm = MPIComm(world)
-    N_c = [100, 80, 110]
-    cell_cv = np.diag(np.ones(3))
-    ciderpw = LibCiderPW(N_c, cell_cv, comm)
-    ciderpw.initialize_backend()
-
-    from gpaw.pw.descriptor import PWDescriptor
-
-    gd = GridDescriptor(N_c, cell_cv, comm=MPIComm(world))
-    pd = PWDescriptor(180, gd, dtype=float)
-
-    wrapper = wrap_fft_mpi(pd)
-
-    input = np.random.normal(size=N_c)[None, :, :, :]
-    input = gd.distribute(input)
-
-    output_ref = pd.fft(input)
-    sum = np.abs(output_ref).sum()
-    sum = pd.gd.comm.sum_scalar(sum)
-    import time
-
-    t0 = time.monotonic()
-    for i in range(20):
-        output_test = wrapper.fft(input)
-    t1 = time.monotonic()
-    print("TIME", t1 - t0)
-    input_test = wrapper.ifft(output_test)
-    input_ref = pd.ifft(output_ref)
-
-    from numpy.testing import assert_allclose
-
-    assert_allclose(output_test[0], output_ref)
-    assert_allclose(input_test[0], input_ref)
diff --git a/ciderpress/gpaw/tests/test_basic_calc.py b/ciderpress/gpaw/tests/test_basic_calc.py
index 88d9975..260fae1 100644
--- a/ciderpress/gpaw/tests/test_basic_calc.py
+++ b/ciderpress/gpaw/tests/test_basic_calc.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_descriptors.py b/ciderpress/gpaw/tests/test_descriptors.py
index 6866aa0..e3a586d 100644
--- a/ciderpress/gpaw/tests/test_descriptors.py
+++ b/ciderpress/gpaw/tests/test_descriptors.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_fe_force.py b/ciderpress/gpaw/tests/test_fe_force.py
index c5abf1f..d5ffd37 100644
--- a/ciderpress/gpaw/tests/test_fe_force.py
+++ b/ciderpress/gpaw/tests/test_fe_force.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_fe_stress.py b/ciderpress/gpaw/tests/test_fe_stress.py
index ddd61fc..d4dd1c2 100644
--- a/ciderpress/gpaw/tests/test_fe_stress.py
+++ b/ciderpress/gpaw/tests/test_fe_stress.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_nldf_interface.py b/ciderpress/gpaw/tests/test_nldf_interface.py
new file mode 100644
index 0000000..1547a5e
--- /dev/null
+++ b/ciderpress/gpaw/tests/test_nldf_interface.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
+import unittest
+
+import numpy as np
+from gpaw.mpi import MPIComm, world
+from gpaw.pw.descriptor import PWDescriptor
+from numpy.testing import assert_allclose
+
+from ciderpress.gpaw.nldf_interface import GridDescriptor, LibCiderPW, wrap_fft_mpi
+
+
+class TestLibCiderPW(unittest.TestCase):
+    def test_pwutil(self):
+        np.random.seed(98)
+        comm = MPIComm(world)
+        N_c = [100, 80, 110]
+        cell_cv = np.diag(np.ones(3))
+        ciderpw = LibCiderPW(N_c, cell_cv, comm)
+        ciderpw.initialize_backend()
+        gd = GridDescriptor(N_c, cell_cv, comm=MPIComm(world))
+        pd = PWDescriptor(180, gd, dtype=float)
+        wrapper = wrap_fft_mpi(pd)
+        input = np.random.normal(size=N_c)[None, :, :, :]
+        input = gd.distribute(input)
+        output_ref = pd.fft(input)
+        sum = np.abs(output_ref).sum()
+        sum = pd.gd.comm.sum_scalar(sum)
+        for i in range(20):
+            output_test = wrapper.fft(input)
+        input_test = wrapper.ifft(output_test)
+        input_ref = pd.ifft(output_ref)
+        assert_allclose(output_test[0], output_ref)
+        assert_allclose(input_test[0], input_ref)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/ciderpress/gpaw/tests/test_pp_force.py b/ciderpress/gpaw/tests/test_pp_force.py
index 4531ec5..d7ffdf4 100644
--- a/ciderpress/gpaw/tests/test_pp_force.py
+++ b/ciderpress/gpaw/tests/test_pp_force.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_read_write.py b/ciderpress/gpaw/tests/test_read_write.py
index b0dd8cd..6bc11f8 100644
--- a/ciderpress/gpaw/tests/test_read_write.py
+++ b/ciderpress/gpaw/tests/test_read_write.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_si_force.py b/ciderpress/gpaw/tests/test_si_force.py
index 84426ea..cb74754 100644
--- a/ciderpress/gpaw/tests/test_si_force.py
+++ b/ciderpress/gpaw/tests/test_si_force.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/gpaw/tests/test_si_stress.py b/ciderpress/gpaw/tests/test_si_stress.py
index c919040..605ff1e 100644
--- a/ciderpress/gpaw/tests/test_si_stress.py
+++ b/ciderpress/gpaw/tests/test_si_stress.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/lib/CMakeLists.txt b/ciderpress/lib/CMakeLists.txt
index fef84ba..fc025ec 100644
--- a/ciderpress/lib/CMakeLists.txt
+++ b/ciderpress/lib/CMakeLists.txt
@@ -130,9 +130,7 @@ if(ENABLE_FFTW AND BUILD_FFTW)
     add_dependencies(pwutil libfftw3)
 endif()
 
-# TODO some other stuff from pyscf CMake file was here
-
-# TODO some other stuff from pyscf CMake file was here
+# NOTE some other stuff from pyscf CMake file was here
 
 if(EXISTS "${PROJECT_SOURCE_DIR}/cmake.user.inc")
   include("${PROJECT_SOURCE_DIR}/cmake.user.inc")
diff --git a/ciderpress/lib/mod_cider/conv_interpolation.c b/ciderpress/lib/mod_cider/conv_interpolation.c
index ff5dfce..832194b 100644
--- a/ciderpress/lib/mod_cider/conv_interpolation.c
+++ b/ciderpress/lib/mod_cider/conv_interpolation.c
@@ -23,6 +23,7 @@
 #include "sph_harm.h"
 #include "spline.h"
 #include <math.h>
+#include <omp.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -301,6 +302,25 @@ void compute_num_spline_contribs_multi(spline_locator *spline_locs,
     }
 }
 
+/**
+ * Compute the number of coordinates (coords) that fall within each
+ * radial spline block. The result (num_ai) can then be used
+ * in compute_spline_ind_order_new
+ * to order the grids by their distance from the atom at atm_coord.
+ * num_ai: The result, num_ai[ia * nrad + ir] contains the number of
+ *         grid points that fall in spline index ir of atom with index ia.
+ * coords: coords + 3 * g is the real-space coordinate of grid g.
+ * atm_coords: atm_coords + 3 * ia is the real-space coordinate of atom ia.
+ * aparam, dparam: The radial grid with index ir is
+ *                 aparam * (exp(dparam * ir) - 1)
+ * natm: Number of atoms
+ * ngrids: Number of 3D real-space grids
+ * nrad: Number of grids for the radial spline on each atom.
+ * iatom_g: coords + 3 * g is part of the atomic grid belonging
+ *          to the atom with index iatom_g[g]. If iatom_g is NULL,
+ *          it is ignored. If it is not NULL, it is used to ignore
+ *          on-site grids when constructing num_ai.
+ */
 void compute_num_spline_contribs_new(int *num_ai, double *coords,
                                      double *atm_coords, double aparam,
                                      double dparam, int natm, int ngrids,
@@ -431,6 +451,31 @@ void compute_spline_ind_order(int *loc_i, double *coords, double *atm_coord,
     free(num_i_tmp);
 }
 
+/**
+ * TODO this function should be modified to run in parallel.
+ * This function sorts the grid coordinates (coords) in order of their
+ * distance from the atomic coordinate (atm_coord).
+ * loc_i: For each radial index ir, loc_i[ir] says where each batch
+ *        corresponding to index ir of the radial spline is located
+ *        in the coords_ord array. It is constructed in the
+ *        _set_num_ai function of ciderpress.dft.lcao_interpolation.
+ * coords: 3-D grid coordinations
+ * atm_coord: Atomic coordinate
+ * coords_ord: OUTPUT. The ordered coordinates are saved to this array.
+ * ind_ord_fwd, ind_ord_bwd: OUTPUT. After execution, for x in 0, 1, 2,
+ *         the following relationships hold:
+ *         coords_ord[3 * g + x] == coords[3 * ind_ord_fwd[g] + x]
+ *         coords_ord[3 * ind_ord_bwd[g] + x] == coords[3 * g + x]
+ * aparam, dparam: The radial grid with index ir is
+ *                 aparam * (exp(dparam * ir) - 1)
+ * ngrids: Number of 3D real-space grids
+ * nrad: Number of grids for the radial spline on each atom.
+ * iatom_g: coords + 3 * g is part of the atomic grid belonging
+ *          to the atom with index iatom_g[g]. If iatom_g is NULL,
+ *          it is ignored. If it is not NULL, it is used to ignore
+ *          on-site grids when constructing num_ai.
+ * iatom: Index of the atom of for which the grids are being ordered.
+ */
 void compute_spline_ind_order_new(int *loc_i, double *coords, double *atm_coord,
                                   double *coords_ord, int *ind_ord_fwd,
                                   int *ind_ord_bwd, double aparam,
@@ -737,8 +782,8 @@ void compute_pot_convs_single_new(double *f_gq, double *f_rlpq, double *auxo_gl,
         int q, ir, g, p;
         double *f_lpq;
         double *auxo_tmp_gl;
-        double BETA =
-            0; // TODO beta should be 1, not 0, when doing grid batches
+        // TODO beta should be 1, not 0, when doing grid batches
+        double BETA = 0;
         double ALPHA = 1;
         char NTRANS = 'N';
         char TRANS = 'T';
@@ -1209,8 +1254,8 @@ void add_lp1_term_onsite_bwd(double *f, double *coords, int natm,
 }
 
 // TODO might want to move this somewhere else
-void contract_grad_terms(double *excsum, double *f_g, int natm, int a, int v,
-                         int ngrids, int *ga_loc) {
+void contract_grad_terms_old(double *excsum, double *f_g, int natm, int a,
+                             int v, int ngrids, int *ga_loc) {
     double *tmp = (double *)calloc(natm, sizeof(double));
     int ib;
 #pragma omp parallel
@@ -1231,9 +1276,8 @@ void contract_grad_terms(double *excsum, double *f_g, int natm, int a, int v,
     free(tmp);
 }
 
-void contract_grad_terms2(double *excsum, double *f_g, int natm, int a, int v,
-                          int ngrids, int *atm_g) {
-    // TODO neeed to parallelize
+void contract_grad_terms_serial(double *excsum, double *f_g, int natm, int a,
+                                int v, int ngrids, int *atm_g) {
     double *tmp = (double *)calloc(natm, sizeof(double));
     int ib;
     int ia;
@@ -1247,3 +1291,37 @@ void contract_grad_terms2(double *excsum, double *f_g, int natm, int a, int v,
     }
     free(tmp);
 }
+
+void contract_grad_terms_parallel(double *excsum, double *f_g, int natm, int a,
+                                  int v, int ngrids, int *atm_g) {
+    double *tmp_priv;
+    double total = 0;
+#pragma omp parallel
+    {
+        const int nthreads = omp_get_num_threads();
+        const int ithread = omp_get_thread_num();
+        const int ngrids_local = (ngrids + nthreads - 1) / nthreads;
+        const int ig0 = ithread * ngrids_local;
+        const int ig1 = MIN(ig0 + ngrids_local, ngrids);
+#pragma omp single
+        { tmp_priv = (double *)calloc(nthreads * natm, sizeof(double)); }
+#pragma omp barrier
+        double *my_tmp = tmp_priv + ithread * natm;
+        int ib;
+        int it;
+        int g;
+        for (g = ig0; g < ig1; g++) {
+            my_tmp[atm_g[g]] += f_g[g];
+        }
+#pragma omp barrier
+#pragma omp for reduction(+ : total)
+        for (ib = 0; ib < natm; ib++) {
+            for (it = 0; it < nthreads; it++) {
+                excsum[ib * 3 + v] += tmp_priv[it * natm + ib];
+                total += tmp_priv[it * natm + ib];
+            }
+        }
+    }
+    excsum[a * 3 + v] -= total;
+    free(tmp_priv);
+}
diff --git a/ciderpress/lib/mod_cider/fast_sdmx.c b/ciderpress/lib/mod_cider/fast_sdmx.c
index c9dbefa..f2ad6ac 100644
--- a/ciderpress/lib/mod_cider/fast_sdmx.c
+++ b/ciderpress/lib/mod_cider/fast_sdmx.c
@@ -452,15 +452,15 @@ void SDMXylm_yzx2xyz(int ngrids, int nv, double *ylm_vlg, int *ylm_atom_loc,
 void SDMXcontract_ao_to_bas(int ngrids, double *vbas, double *ylm_lg,
                             double *ao, int *shls_slice, int *ao_loc,
                             int *ylm_atom_loc, int *atm, int natm, int *bas,
-                            int nbas, double *env) {
+                            int nbas, double *env, int nrf, int *rf_loc) {
 #pragma omp parallel
     {
         const int nthread = omp_get_num_threads();
         const int blksize = (ngrids + nthread - 1) / nthread;
-        int thread, sh, m, g, bgrids;
+        int thread, sh, m, nm, di, g, bgrids;
         int sh0 = shls_slice[0];
         int sh1 = shls_slice[1];
-        int deg, ia, l, ip;
+        int ia, l, ip, irf;
         double *_vbas;
         double *_ylm;
         double *_ao;
@@ -471,19 +471,20 @@ void SDMXcontract_ao_to_bas(int ngrids, double *vbas, double *ylm_lg,
             for (sh = sh0; sh < sh1; sh++) {
                 ia = bas[sh * BAS_SLOTS + ATOM_OF];
                 l = bas[sh * BAS_SLOTS + ANG_OF];
-                deg = ao_loc[sh + 1] - ao_loc[sh];
-                _vbas = vbas + sh * ngrids + ip;
-                for (g = 0; g < bgrids; g++) {
-                    _vbas[g] = 0;
-                }
-                for (m = 0; m < deg; m++) {
-                    _ylm =
-                        ylm_lg + (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
-                    _ao = ao + (ao_loc[sh] + m) * ngrids + ip;
+                nm = 2 * l + 1;
+                di = 0;
+                for (irf = rf_loc[sh]; irf < rf_loc[sh + 1]; irf++) {
+                    _vbas = vbas + irf * ngrids + ip;
                     for (g = 0; g < bgrids; g++) {
-                        // vbas[g] += ylm_lg[(ylm_atom_loc[ia] + l * l + m) *
-                        // ngrids + g] * ao[(ao_loc[sh] + m) * ngrids + g];
-                        _vbas[g] += _ylm[g] * _ao[g];
+                        _vbas[g] = 0;
+                    }
+                    for (m = 0; m < nm; m++, di++) {
+                        _ylm = ylm_lg +
+                               (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
+                        _ao = ao + (ao_loc[sh] + di) * ngrids + ip;
+                        for (g = 0; g < bgrids; g++) {
+                            _vbas[g] += _ylm[g] * _ao[g];
+                        }
                     }
                 }
             }
@@ -494,16 +495,16 @@ void SDMXcontract_ao_to_bas(int ngrids, double *vbas, double *ylm_lg,
 void SDMXcontract_ao_to_bas_bwd(int ngrids, double *vbas, double *ylm_lg,
                                 double *ao, int *shls_slice, int *ao_loc,
                                 int *ylm_atom_loc, int *atm, int natm, int *bas,
-                                int nbas, double *env) {
+                                int nbas, double *env, int nrf, int *rf_loc) {
 #pragma omp parallel
     {
         // NOTE: This is an in-place operation and ads to ao.
         const int nthread = omp_get_num_threads();
         const int blksize = (ngrids + nthread - 1) / nthread;
-        int thread, sh, m, g, bgrids;
+        int thread, sh, m, g, bgrids, di, nm;
         int sh0 = shls_slice[0];
         int sh1 = shls_slice[1];
-        int deg, ia, l, ip;
+        int ia, l, ip, irf;
         double *_vbas;
         double *_ylm;
         double *_ao;
@@ -514,16 +515,17 @@ void SDMXcontract_ao_to_bas_bwd(int ngrids, double *vbas, double *ylm_lg,
             for (sh = sh0; sh < sh1; sh++) {
                 ia = bas[sh * BAS_SLOTS + ATOM_OF];
                 l = bas[sh * BAS_SLOTS + ANG_OF];
-                deg = ao_loc[sh + 1] - ao_loc[sh];
-                _vbas = vbas + sh * ngrids + ip;
-                for (m = 0; m < deg; m++) {
-                    _ylm =
-                        ylm_lg + (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
-                    _ao = ao + (ao_loc[sh] + m) * ngrids + ip;
-                    for (g = 0; g < bgrids; g++) {
-                        // vbas[g] += ylm_lg[(ylm_atom_loc[ia] + l * l + m) *
-                        // ngrids + g] * ao[(ao_loc[sh] + m) * ngrids + g];
-                        _ao[g] += _ylm[g] * _vbas[g];
+                nm = 2 * l + 1;
+                di = 0;
+                for (irf = rf_loc[sh]; irf < rf_loc[sh + 1]; irf++) {
+                    _vbas = vbas + irf * ngrids + ip;
+                    for (m = 0; m < nm; m++, di++) {
+                        _ylm = ylm_lg +
+                               (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
+                        _ao = ao + (ao_loc[sh] + di) * ngrids + ip;
+                        for (g = 0; g < bgrids; g++) {
+                            _ao[g] += _ylm[g] * _vbas[g];
+                        }
                     }
                 }
             }
@@ -534,16 +536,16 @@ void SDMXcontract_ao_to_bas_bwd(int ngrids, double *vbas, double *ylm_lg,
 void SDMXcontract_ao_to_bas_grid(int ngrids, double *vbas, double *ylm_lg,
                                  double *ao, int *shls_slice, int *ao_loc,
                                  int *ylm_atom_loc, int *atm, int natm,
-                                 int *bas, int nbas, double *env, double *gridx,
-                                 double *atomx) {
+                                 int *bas, int nbas, double *env, int nrf,
+                                 int *rf_loc, double *gridx, double *atomx) {
 #pragma omp parallel
     {
         const int nthread = omp_get_num_threads();
         const int blksize = (ngrids + nthread - 1) / nthread;
-        int thread, sh, m, g;
+        int thread, sh, m, g, di, nm;
         int sh0 = shls_slice[0];
         int sh1 = shls_slice[1];
-        int deg, ia, l, ip;
+        int ia, l, ip, irf;
         int bgrids;
         double *_vbas;
         double *_ylm;
@@ -557,19 +559,25 @@ void SDMXcontract_ao_to_bas_grid(int ngrids, double *vbas, double *ylm_lg,
             for (sh = sh0; sh < sh1; sh++) {
                 ia = bas[sh * BAS_SLOTS + ATOM_OF];
                 l = bas[sh * BAS_SLOTS + ANG_OF];
-                deg = ao_loc[sh + 1] - ao_loc[sh];
+                nm = 2 * l + 1;
                 _vbas = vbas + sh * ngrids + ip;
                 _gridx = gridx + ip;
                 for (g = 0; g < bgrids; g++) {
-                    _vbas[g] = 0;
                     dx[g] = _gridx[g] - atomx[ia];
                 }
-                for (m = 0; m < deg; m++) {
-                    _ylm =
-                        ylm_lg + (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
-                    _ao = ao + (ao_loc[sh] + m) * ngrids + ip;
+                di = 0;
+                for (irf = rf_loc[sh]; irf < rf_loc[sh + 1]; irf++) {
+                    _vbas = vbas + irf * ngrids + ip;
                     for (g = 0; g < bgrids; g++) {
-                        _vbas[g] += _ylm[g] * _ao[g] * dx[g];
+                        _vbas[g] = 0;
+                    }
+                    for (m = 0; m < nm; m++, di++) {
+                        _ylm = ylm_lg +
+                               (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
+                        _ao = ao + (ao_loc[sh] + di) * ngrids + ip;
+                        for (g = 0; g < bgrids; g++) {
+                            _vbas[g] += _ylm[g] * _ao[g] * dx[g];
+                        }
                     }
                 }
             }
@@ -578,27 +586,27 @@ void SDMXcontract_ao_to_bas_grid(int ngrids, double *vbas, double *ylm_lg,
     }
 }
 
-void SDMXcontract_ao_to_bas_l1(int ngrids, double *vbas, double *ylm_vlg,
-                               double *ao, int *shls_slice, int *ao_loc,
-                               int *ylm_atom_loc, int *atm, int natm, int *bas,
-                               int nbas, double *env, double *gridx,
-                               double *atomx) {
+void SDMXcontract_ao_to_bas_grid_bwd(int ngrids, double *vbas, double *ylm_lg,
+                                     double *ao, int *shls_slice, int *ao_loc,
+                                     int *ylm_atom_loc, int *atm, int natm,
+                                     int *bas, int nbas, double *env,
+                                     double *gridx, double *atomx, int nrf,
+                                     int *rf_loc) {
 #pragma omp parallel
     {
+        // NOTE: This is an in-place operation and ads to ao.
         const int nthread = omp_get_num_threads();
         const int blksize = (ngrids + nthread - 1) / nthread;
-        int thread, sh, m, g;
+        int thread, sh, m, g, di, nm;
         int sh0 = shls_slice[0];
         int sh1 = shls_slice[1];
-        int deg, ia, l, ip;
-        int bgrids;
+        int ia, l, ip;
+        int bgrids, irf;
+        double *_vbas;
+        double *_ylm;
         double *_ao;
-        double *_gridx, *_gridy, *_gridz;
-        double *atomy = atomx + natm;
-        double *atomz = atomx + 2 * natm;
-        double *_vbas0, *_vbas1, *_vbas2, *_vbas3;
-        double *_ylm0, *_ylm1, *_ylm2, *_ylm3;
-        int offset;
+        double *_gridx;
+        double *dx = malloc(sizeof(double) * blksize);
 #pragma omp for
         for (thread = 0; thread < nthread; thread++) {
             ip = blksize * thread;
@@ -606,60 +614,42 @@ void SDMXcontract_ao_to_bas_l1(int ngrids, double *vbas, double *ylm_vlg,
             for (sh = sh0; sh < sh1; sh++) {
                 ia = bas[sh * BAS_SLOTS + ATOM_OF];
                 l = bas[sh * BAS_SLOTS + ANG_OF];
-                deg = ao_loc[sh + 1] - ao_loc[sh];
-                offset = sh * ngrids + ip;
-                _vbas0 = vbas + offset;
-                _vbas1 = vbas + 1 * nbas * ngrids + offset;
-                _vbas2 = vbas + 2 * nbas * ngrids + offset;
-                _vbas3 = vbas + 3 * nbas * ngrids + offset;
+                nm = 2 * l + 1;
                 _gridx = gridx + ip;
-                _gridy = gridx + ngrids + ip;
-                _gridz = gridx + 2 * ngrids + ip;
                 for (g = 0; g < bgrids; g++) {
-                    _vbas0[g] = 0;
-                    _vbas1[g] = 0;
-                    _vbas2[g] = 0;
-                    _vbas3[g] = 0;
+                    dx[g] = _gridx[g] - atomx[ia];
                 }
-                for (m = 0; m < deg; m++) {
-                    offset = (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
-                    _ylm0 = ylm_vlg + offset;
-                    _ylm1 = ylm_vlg + 1 * ylm_atom_loc[natm] * ngrids + offset;
-                    _ylm2 = ylm_vlg + 2 * ylm_atom_loc[natm] * ngrids + offset;
-                    _ylm3 = ylm_vlg + 3 * ylm_atom_loc[natm] * ngrids + offset;
-                    _ao = ao + (ao_loc[sh] + m) * ngrids + ip;
-                    for (g = 0; g < bgrids; g++) {
-                        _vbas0[g] += _ylm0[g] * _ao[g];
-                        _vbas1[g] += _ylm1[g] * _ao[g];
-                        _vbas2[g] += _ylm2[g] * _ao[g];
-                        _vbas3[g] += _ylm3[g] * _ao[g];
+                di = 0;
+                for (irf = rf_loc[sh]; irf < rf_loc[sh + 1]; irf++) {
+                    _vbas = vbas + irf * ngrids + ip;
+                    for (m = 0; m < nm; m++, di++) {
+                        _ylm = ylm_lg +
+                               (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
+                        _ao = ao + (ao_loc[sh] + di) * ngrids + ip;
+                        for (g = 0; g < bgrids; g++) {
+                            _ao[g] += _ylm[g] * _vbas[g] * dx[g];
+                        }
                     }
                 }
-                offset = 3 * nbas * ngrids;
-                for (g = 0; g < bgrids; g++) {
-                    _vbas1[offset + g] = _vbas0[g] * (_gridx[g] - atomx[ia]);
-                    _vbas2[offset + g] = _vbas0[g] * (_gridy[g] - atomy[ia]);
-                    _vbas3[offset + g] = _vbas0[g] * (_gridz[g] - atomz[ia]);
-                }
             }
         }
+        free(dx);
     }
 }
 
-void SDMXcontract_ao_to_bas_l1_bwd(int ngrids, double *vbas, double *ylm_vlg,
-                                   double *ao, int *shls_slice, int *ao_loc,
-                                   int *ylm_atom_loc, int *atm, int natm,
-                                   int *bas, int nbas, double *env,
-                                   double *gridx, double *atomx) {
+void SDMXcontract_ao_to_bas_l1(int ngrids, double *vbas, double *ylm_vlg,
+                               double *ao, int *shls_slice, int *ao_loc,
+                               int *ylm_atom_loc, int *atm, int natm, int *bas,
+                               int nbas, double *env, double *gridx,
+                               double *atomx, int nrf, int *rf_loc) {
 #pragma omp parallel
     {
-        // NOTE: ao must be zero upon entry
         const int nthread = omp_get_num_threads();
         const int blksize = (ngrids + nthread - 1) / nthread;
-        int thread, sh, m, g;
+        int thread, sh, m, g, di, nm;
         int sh0 = shls_slice[0];
         int sh1 = shls_slice[1];
-        int deg, ia, l, ip;
+        int ia, l, ip, irf;
         int bgrids;
         double *_ao;
         double *_gridx, *_gridy, *_gridz;
@@ -667,7 +657,6 @@ void SDMXcontract_ao_to_bas_l1_bwd(int ngrids, double *vbas, double *ylm_vlg,
         double *atomz = atomx + 2 * natm;
         double *_vbas0, *_vbas1, *_vbas2, *_vbas3;
         double *_ylm0, *_ylm1, *_ylm2, *_ylm3;
-        double *vb0tmp = malloc(sizeof(double) * blksize);
         int offset;
 #pragma omp for
         for (thread = 0; thread < nthread; thread++) {
@@ -676,34 +665,48 @@ void SDMXcontract_ao_to_bas_l1_bwd(int ngrids, double *vbas, double *ylm_vlg,
             for (sh = sh0; sh < sh1; sh++) {
                 ia = bas[sh * BAS_SLOTS + ATOM_OF];
                 l = bas[sh * BAS_SLOTS + ANG_OF];
-                deg = ao_loc[sh + 1] - ao_loc[sh];
-                offset = sh * ngrids + ip;
-                _vbas0 = vbas + offset;
-                _vbas1 = vbas + 1 * nbas * ngrids + offset;
-                _vbas2 = vbas + 2 * nbas * ngrids + offset;
-                _vbas3 = vbas + 3 * nbas * ngrids + offset;
-                _gridx = gridx + ip;
-                _gridy = gridx + ngrids + ip;
-                _gridz = gridx + 2 * ngrids + ip;
-                offset = 3 * nbas * ngrids;
-                for (g = 0; g < bgrids; g++) {
-                    vb0tmp[g] = _vbas0[g];
-                    vb0tmp[g] += _vbas1[offset + g] * (_gridx[g] - atomx[ia]);
-                    vb0tmp[g] += _vbas2[offset + g] * (_gridy[g] - atomy[ia]);
-                    vb0tmp[g] += _vbas3[offset + g] * (_gridz[g] - atomz[ia]);
-                }
-                for (m = 0; m < deg; m++) {
-                    offset = (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
-                    _ylm0 = ylm_vlg + offset;
-                    _ylm1 = ylm_vlg + 1 * ylm_atom_loc[natm] * ngrids + offset;
-                    _ylm2 = ylm_vlg + 2 * ylm_atom_loc[natm] * ngrids + offset;
-                    _ylm3 = ylm_vlg + 3 * ylm_atom_loc[natm] * ngrids + offset;
-                    _ao = ao + (ao_loc[sh] + m) * ngrids + ip;
+                nm = 2 * l + 1;
+                di = 0;
+                for (irf = rf_loc[sh]; irf < rf_loc[sh + 1]; irf++) {
+                    offset = irf * ngrids + ip;
+                    _vbas0 = vbas + offset;
+                    _vbas1 = vbas + 1 * nrf * ngrids + offset;
+                    _vbas2 = vbas + 2 * nrf * ngrids + offset;
+                    _vbas3 = vbas + 3 * nrf * ngrids + offset;
+                    _gridx = gridx + ip;
+                    _gridy = gridx + ngrids + ip;
+                    _gridz = gridx + 2 * ngrids + ip;
                     for (g = 0; g < bgrids; g++) {
-                        _ao[g] = _ylm0[g] * vb0tmp[g];
-                        _ao[g] += _ylm1[g] * _vbas1[g];
-                        _ao[g] += _ylm2[g] * _vbas2[g];
-                        _ao[g] += _ylm3[g] * _vbas3[g];
+                        _vbas0[g] = 0;
+                        _vbas1[g] = 0;
+                        _vbas2[g] = 0;
+                        _vbas3[g] = 0;
+                    }
+                    for (m = 0; m < nm; m++, di++) {
+                        offset = (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
+                        _ylm0 = ylm_vlg + offset;
+                        _ylm1 =
+                            ylm_vlg + 1 * ylm_atom_loc[natm] * ngrids + offset;
+                        _ylm2 =
+                            ylm_vlg + 2 * ylm_atom_loc[natm] * ngrids + offset;
+                        _ylm3 =
+                            ylm_vlg + 3 * ylm_atom_loc[natm] * ngrids + offset;
+                        _ao = ao + (ao_loc[sh] + di) * ngrids + ip;
+                        for (g = 0; g < bgrids; g++) {
+                            _vbas0[g] += _ylm0[g] * _ao[g];
+                            _vbas1[g] += _ylm1[g] * _ao[g];
+                            _vbas2[g] += _ylm2[g] * _ao[g];
+                            _vbas3[g] += _ylm3[g] * _ao[g];
+                        }
+                    }
+                    offset = 3 * nrf * ngrids;
+                    for (g = 0; g < bgrids; g++) {
+                        _vbas1[offset + g] =
+                            _vbas0[g] * (_gridx[g] - atomx[ia]);
+                        _vbas2[offset + g] =
+                            _vbas0[g] * (_gridy[g] - atomy[ia]);
+                        _vbas3[offset + g] =
+                            _vbas0[g] * (_gridz[g] - atomz[ia]);
                     }
                 }
             }
@@ -711,26 +714,30 @@ void SDMXcontract_ao_to_bas_l1_bwd(int ngrids, double *vbas, double *ylm_vlg,
     }
 }
 
-void SDMXcontract_ao_to_bas_grid_bwd(int ngrids, double *vbas, double *ylm_lg,
-                                     double *ao, int *shls_slice, int *ao_loc,
-                                     int *ylm_atom_loc, int *atm, int natm,
-                                     int *bas, int nbas, double *env,
-                                     double *gridx, double *atomx) {
+void SDMXcontract_ao_to_bas_l1_bwd(int ngrids, double *vbas, double *ylm_vlg,
+                                   double *ao, int *shls_slice, int *ao_loc,
+                                   int *ylm_atom_loc, int *atm, int natm,
+                                   int *bas, int nbas, double *env,
+                                   double *gridx, double *atomx, int nrf,
+                                   int *rf_loc) {
 #pragma omp parallel
     {
-        // NOTE: This is an in-place operation and ads to ao.
+        // NOTE: ao must be zero upon entry
         const int nthread = omp_get_num_threads();
         const int blksize = (ngrids + nthread - 1) / nthread;
-        int thread, sh, m, g;
+        int thread, sh, m, g, di, nm;
         int sh0 = shls_slice[0];
         int sh1 = shls_slice[1];
-        int deg, ia, l, ip;
+        int ia, l, ip, irf;
         int bgrids;
-        double *_vbas;
-        double *_ylm;
         double *_ao;
-        double *_gridx;
-        double *dx = malloc(sizeof(double) * blksize);
+        double *_gridx, *_gridy, *_gridz;
+        double *atomy = atomx + natm;
+        double *atomz = atomx + 2 * natm;
+        double *_vbas0, *_vbas1, *_vbas2, *_vbas3;
+        double *_ylm0, *_ylm1, *_ylm2, *_ylm3;
+        double *vb0tmp = malloc(sizeof(double) * blksize);
+        int offset;
 #pragma omp for
         for (thread = 0; thread < nthread; thread++) {
             ip = blksize * thread;
@@ -738,23 +745,48 @@ void SDMXcontract_ao_to_bas_grid_bwd(int ngrids, double *vbas, double *ylm_lg,
             for (sh = sh0; sh < sh1; sh++) {
                 ia = bas[sh * BAS_SLOTS + ATOM_OF];
                 l = bas[sh * BAS_SLOTS + ANG_OF];
-                deg = ao_loc[sh + 1] - ao_loc[sh];
-                _vbas = vbas + sh * ngrids + ip;
-                _gridx = gridx + ip;
-                for (g = 0; g < bgrids; g++) {
-                    dx[g] = _gridx[g] - atomx[ia];
-                }
-                for (m = 0; m < deg; m++) {
-                    _ylm =
-                        ylm_lg + (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
-                    _ao = ao + (ao_loc[sh] + m) * ngrids + ip;
+                nm = 2 * l + 1;
+                di = 0;
+                for (irf = rf_loc[sh]; irf < rf_loc[sh + 1]; irf++) {
+                    offset = irf * ngrids + ip;
+                    _vbas0 = vbas + offset;
+                    _vbas1 = vbas + 1 * nrf * ngrids + offset;
+                    _vbas2 = vbas + 2 * nrf * ngrids + offset;
+                    _vbas3 = vbas + 3 * nrf * ngrids + offset;
+                    _gridx = gridx + ip;
+                    _gridy = gridx + ngrids + ip;
+                    _gridz = gridx + 2 * ngrids + ip;
+                    offset = 3 * nrf * ngrids;
                     for (g = 0; g < bgrids; g++) {
-                        _ao[g] += _ylm[g] * _vbas[g] * dx[g];
+                        vb0tmp[g] = _vbas0[g];
+                        vb0tmp[g] +=
+                            _vbas1[offset + g] * (_gridx[g] - atomx[ia]);
+                        vb0tmp[g] +=
+                            _vbas2[offset + g] * (_gridy[g] - atomy[ia]);
+                        vb0tmp[g] +=
+                            _vbas3[offset + g] * (_gridz[g] - atomz[ia]);
+                    }
+                    for (m = 0; m < nm; m++, di++) {
+                        offset = (ylm_atom_loc[ia] + l * l + m) * ngrids + ip;
+                        _ylm0 = ylm_vlg + offset;
+                        _ylm1 =
+                            ylm_vlg + 1 * ylm_atom_loc[natm] * ngrids + offset;
+                        _ylm2 =
+                            ylm_vlg + 2 * ylm_atom_loc[natm] * ngrids + offset;
+                        _ylm3 =
+                            ylm_vlg + 3 * ylm_atom_loc[natm] * ngrids + offset;
+                        _ao = ao + (ao_loc[sh] + di) * ngrids + ip;
+                        for (g = 0; g < bgrids; g++) {
+                            _ao[g] = _ylm0[g] * vb0tmp[g];
+                            _ao[g] += _ylm1[g] * _vbas1[g];
+                            _ao[g] += _ylm2[g] * _vbas2[g];
+                            _ao[g] += _ylm3[g] * _vbas3[g];
+                        }
                     }
                 }
             }
         }
-        free(dx);
+        free(vb0tmp);
     }
 }
 
@@ -947,10 +979,10 @@ void SDMXeval_loop(void (*fiter)(), FPtr_eval_sdmx feval, FPtr_exp_sdmx fexp,
 }
 
 void SDMXeval_rad_loop(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
-                       int ngrids, int param[], int *shls_slice, double *vbas,
-                       double *coord, uint8_t *non0table, int *atm, int natm,
-                       int *bas, int nbas, double *env, double *alphas,
-                       double *alpha_norms, int nalpha) {
+                       int ngrids, int param[], int *shls_slice, int *rf_loc,
+                       double *vbas, double *coord, uint8_t *non0table,
+                       int *atm, int natm, int *bas, int nbas, double *env,
+                       double *alphas, double *alpha_norms, int nalpha) {
     int shloc[shls_slice[1] - shls_slice[0] + 1];
     const int nshlblk = SDMXshloc_by_atom(shloc, shls_slice, atm, bas);
     const int nblk = (ngrids + BLKSIZE - 1) / BLKSIZE;
@@ -959,7 +991,7 @@ void SDMXeval_rad_loop(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
     {
         const int sh0 = shls_slice[0];
         const int sh1 = shls_slice[1];
-        const size_t nao = sh1 - sh0;
+        const size_t nao = rf_loc[sh1] - rf_loc[sh0];
         int ip, ib, k, iloc, ish;
         size_t soff, bgrids;
         int ncart = NCTR_CART * param[TENSOR] * param[POS_E1];
@@ -969,14 +1001,15 @@ void SDMXeval_rad_loop(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
         for (k = 0; k < nblk * nshlblk; k++) {
             iloc = k / nblk;
             ish = shloc[iloc];
-            soff = ish - sh0;
+            soff = rf_loc[ish] - rf_loc[sh0];
             ib = k - iloc * nblk;
             ip = ib * BLKSIZE;
             bgrids = MIN(ngrids - ip, BLKSIZE);
             SDMXeval_rad_iter(feval, fexp, fac, nao, Ngrids, bgrids, param,
-                              shloc + iloc, buf, vbas + soff * Ngrids + ip,
-                              coord + ip, non0table + ib * nbas, atm, natm, bas,
-                              nbas, env, alphas, alpha_norms, nalpha);
+                              shloc + iloc, rf_loc, buf,
+                              vbas + soff * Ngrids + ip, coord + ip,
+                              non0table + ib * nbas, atm, natm, bas, nbas, env,
+                              alphas, alpha_norms, nalpha);
         }
         free(buf);
     }
@@ -995,18 +1028,14 @@ void SDMXeval_sph_iter(FPtr_eval_sdmx feval, FPtr_exp_sdmx fexp, double fac,
     const int atmstart = bas[sh0 * BAS_SLOTS + ATOM_OF];
     const int atmend = bas[(sh1 - 1) * BAS_SLOTS + ATOM_OF] + 1;
     const int atmcount = atmend - atmstart;
-    int i, k, l, np, nc, atm_id, bas_id, deg, dcart, ao_id;
-    size_t di;
+    int i, l, np, nc, atm_id, bas_id, deg, ao_id;
     double fac1;
-    double *p_exp, *pcoeff, *pcoord, *pcart, *ri, *pao;
+    double *p_exp, *pcoeff, *pcoord, *ri;
     double *grid2atm = ALIGN8_UP(buf); // [atm_id,xyz,grid]
     double *ylm_buf = ALIGN8_UP(ybuf);
-    // double *lbuf = ALIGN8_UP(_lbuf); //
     double *eprim = grid2atm + atmcount * 3 * BLKSIZE;
-    double *cart_gto = eprim + NPRIMAX * BLKSIZE * 2;
     int n_alm = ylm_atom_loc[natm];
     double *ylm_vmg;
-    int ialpha;
     int m, g, v;
 
     _fill_grid2atm(grid2atm, coord, bgrids, ngrids, atm + atmstart * ATM_SLOTS,
@@ -1044,9 +1073,8 @@ void SDMXeval_sph_iter(FPtr_eval_sdmx feval, FPtr_exp_sdmx fexp, double fac,
                          env, l, np, nc, nao, ngrids, bgrids, ylm_buf,
                          deg * BLKSIZE);
             } else {
-                printf("Hi sph\n");
-                for (i = 0; i < ncomp;
-                     i++) { // TODO this might not set everything to zero
+                for (i = 0; i < ncomp; i++) {
+                    // TODO this might not set everything to zero
                     _dset0(ao + (i * nao + ao_id) * ngrids, ngrids, bgrids,
                            nc * deg);
                 }
@@ -1058,7 +1086,7 @@ void SDMXeval_sph_iter(FPtr_eval_sdmx feval, FPtr_exp_sdmx fexp, double fac,
 
 void SDMXeval_rad_iter(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
                        size_t nao, size_t ngrids, size_t bgrids, int param[],
-                       int *shls_slice, double *buf, double *vbas,
+                       int *shls_slice, int *rf_loc, double *buf, double *vbas,
                        double *coord, uint8_t *non0table, int *atm, int natm,
                        int *bas, int nbas, double *env, double *alphas,
                        double *alpha_norms, int nalpha) {
@@ -1068,13 +1096,12 @@ void SDMXeval_rad_iter(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
     const int atmstart = bas[sh0 * BAS_SLOTS + ATOM_OF];
     const int atmend = bas[(sh1 - 1) * BAS_SLOTS + ATOM_OF] + 1;
     const int atmcount = atmend - atmstart;
-    int i, k, l, np, nc, atm_id, bas_id, deg, dcart, ao_id;
-    size_t di;
+    int i, k, l, np, nc, atm_id, bas_id;
     double fac1;
-    double *p_exp, *pcoeff, *pcoord, *pcart, *ri, *pao;
+    double *p_exp, *pcoeff, *pcoord;
     double *grid2atm = ALIGN8_UP(buf); // [atm_id,xyz,grid]
     double *eprim = grid2atm + atmcount * 3 * BLKSIZE;
-    int ialpha, sh;
+    int sh;
 
     _fill_grid2atm(grid2atm, coord, bgrids, ngrids, atm + atmstart * ATM_SLOTS,
                    atmcount, bas, nbas, env);
@@ -1083,27 +1110,24 @@ void SDMXeval_rad_iter(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
         np = bas[bas_id * BAS_SLOTS + NPRIM_OF];
         nc = bas[bas_id * BAS_SLOTS + NCTR_OF];
         l = bas[bas_id * BAS_SLOTS + ANG_OF];
-        deg = l * 2 + 1;
         fac1 = fac; // * CINTcommon_fac_sp(l);
         p_exp = env + bas[bas_id * BAS_SLOTS + PTR_EXP];
         pcoeff = env + bas[bas_id * BAS_SLOTS + PTR_COEFF];
         atm_id = bas[bas_id * BAS_SLOTS + ATOM_OF];
         pcoord = grid2atm + (atm_id - atmstart) * 3 * BLKSIZE;
-        sh = bas_id - sh0;
+        sh = rf_loc[bas_id] - rf_loc[sh0];
         for (int ialpha = 0; ialpha < nalpha; ialpha++) {
             if (non0table[bas_id] &&
                 (*fexp)(eprim, pcoord, p_exp, pcoeff, l, np, nc, bgrids, fac1,
                         alphas[ialpha], alpha_norms[ialpha])) {
-                // ri = env + atm[PTR_COORD+atm_id*ATM_SLOTS];
-                // lbuf[i + BLKSIZE * (m + max2lp1 * v)] =
-                //     ylm_vlg[i + ngrids * (m + n_alm * v)];
                 (*feval)(vbas + sh * ngrids, eprim, nc, nao, ngrids, bgrids,
                          nao * ngrids * nalpha);
             } else {
-                printf("Hi rad\n");
                 for (i = 0; i < ncomp; i++) {
-                    _dset0(vbas + (i * nalpha * nao + sh) * ngrids, ngrids,
-                           bgrids, nc);
+                    for (k = 0; k < rf_loc[bas_id + 1] - rf_loc[bas_id]; k++) {
+                        _dset0(vbas + (i * nalpha * nao + sh + k) * ngrids,
+                               ngrids, bgrids, nc);
+                    }
                 }
             }
             sh += nao;
@@ -1156,7 +1180,6 @@ void SDMXshell_eval_grid_cart(double *gto, double *ri, double *exps,
         for (m = 0; m < l2p1; m++) {
 #pragma GCC ivdep
             for (i = 0; i < blksize; i++) {
-                // gto[i] = ylm_vmg[m * ngrids + i] * exps[k * BLKSIZE + i];
                 gto[i] = ylm_vmg[m * BLKSIZE + i] * exps[k * BLKSIZE + i];
             }
             gto += ngrids;
@@ -1189,14 +1212,6 @@ void SDMXshell_eval_grid_cart_deriv1(double *gto, double *ri, double *exps,
         for (m = 0; m < l2p1; m++) {
 #pragma GCC ivdep
             for (i = 0; i < blksize; i++) {
-                /*tmp = y0_mg[m * ngrids + i] * exps_2a[k * BLKSIZE + i];
-                gto[i] = y0_mg[m * ngrids + i] * exps[k * BLKSIZE + i];
-                gtox[i] = tmp * gridx[i];
-                gtoy[i] = tmp * gridy[i];
-                gtoz[i] = tmp * gridz[i];
-                gtox[i] += yx_mg[m * ngrids + i] * exps[k * BLKSIZE + i];
-                gtoy[i] += yy_mg[m * ngrids + i] * exps[k * BLKSIZE + i];
-                gtoz[i] += yz_mg[m * ngrids + i] * exps[k * BLKSIZE + i];*/
                 tmp = y0_mg[m * BLKSIZE + i] * exps_2a[k * BLKSIZE + i];
                 gto[i] = y0_mg[m * BLKSIZE + i] * exps[k * BLKSIZE + i];
                 gtox[i] = tmp * gridx[i];
diff --git a/ciderpress/lib/mod_cider/fast_sdmx.h b/ciderpress/lib/mod_cider/fast_sdmx.h
index bde761b..21b2fe9 100644
--- a/ciderpress/lib/mod_cider/fast_sdmx.h
+++ b/ciderpress/lib/mod_cider/fast_sdmx.h
@@ -25,7 +25,7 @@
 
 void SDMXeval_rad_iter(FPtr_eval_sdmx_rad feval, FPtr_exp_sdmx fexp, double fac,
                        size_t nao, size_t ngrids, size_t bgrids, int param[],
-                       int *shls_slice, double *buf, double *vbas,
+                       int *shls_slice, int *rf_loc, double *buf, double *vbas,
                        double *coord, uint8_t *non0table, int *atm, int natm,
                        int *bas, int nbas, double *env, double *alphas,
                        double *alpha_norms, int nalpha);
diff --git a/ciderpress/lib/mod_cider/model_utils.c b/ciderpress/lib/mod_cider/model_utils.c
index ea095d0..9cf4dec 100644
--- a/ciderpress/lib/mod_cider/model_utils.c
+++ b/ciderpress/lib/mod_cider/model_utils.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include <math.h>
 #include <omp.h>
 #include <stdlib.h>
diff --git a/ciderpress/lib/pwutil/gpaw_interface.c b/ciderpress/lib/pwutil/gpaw_interface.c
index 397870a..6bd8e20 100644
--- a/ciderpress/lib/pwutil/gpaw_interface.c
+++ b/ciderpress/lib/pwutil/gpaw_interface.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include "gpaw_interface.h"
 #include <Python.h>
 #include <mpi.h>
diff --git a/ciderpress/lib/pwutil/gpaw_interface.h b/ciderpress/lib/pwutil/gpaw_interface.h
index 9a08ca8..8b5c071 100644
--- a/ciderpress/lib/pwutil/gpaw_interface.h
+++ b/ciderpress/lib/pwutil/gpaw_interface.h
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #ifndef GPAW_INTERFACE_H
 #define GPAW_INTERFACE_H
 
diff --git a/ciderpress/lib/pwutil/grid_util.c b/ciderpress/lib/pwutil/grid_util.c
index 130343a..726ef61 100644
--- a/ciderpress/lib/pwutil/grid_util.c
+++ b/ciderpress/lib/pwutil/grid_util.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include <complex.h>
 #include <math.h>
 
diff --git a/ciderpress/lib/pwutil/nldf_fft_core.c b/ciderpress/lib/pwutil/nldf_fft_core.c
index 4f0f6cf..aba9427 100644
--- a/ciderpress/lib/pwutil/nldf_fft_core.c
+++ b/ciderpress/lib/pwutil/nldf_fft_core.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include "nldf_fft_core.h"
 #include "config.h"
 #include <assert.h>
diff --git a/ciderpress/lib/pwutil/nldf_fft_core.h b/ciderpress/lib/pwutil/nldf_fft_core.h
index 2532805..4ffe755 100644
--- a/ciderpress/lib/pwutil/nldf_fft_core.h
+++ b/ciderpress/lib/pwutil/nldf_fft_core.h
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #ifndef NLDF_FFT_CORE_H
 #define NLDF_FFT_CORE_H
 #include "config.h"
diff --git a/ciderpress/lib/pwutil/nldf_fft_mpi.c b/ciderpress/lib/pwutil/nldf_fft_mpi.c
index 3bdc2ff..627ddec 100644
--- a/ciderpress/lib/pwutil/nldf_fft_mpi.c
+++ b/ciderpress/lib/pwutil/nldf_fft_mpi.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include "nldf_fft_mpi.h"
 #include "config.h"
 #include "gpaw_interface.h"
diff --git a/ciderpress/lib/pwutil/nldf_fft_mpi.h b/ciderpress/lib/pwutil/nldf_fft_mpi.h
index a5cf448..c25adf1 100644
--- a/ciderpress/lib/pwutil/nldf_fft_mpi.h
+++ b/ciderpress/lib/pwutil/nldf_fft_mpi.h
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #ifndef NLDF_FFT_MPI_H
 #define NLDF_FFT_MPI_H
 #include "nldf_fft_core.h"
diff --git a/ciderpress/lib/pwutil/nldf_fft_serial.c b/ciderpress/lib/pwutil/nldf_fft_serial.c
index 7d76b3b..b86e84a 100644
--- a/ciderpress/lib/pwutil/nldf_fft_serial.c
+++ b/ciderpress/lib/pwutil/nldf_fft_serial.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include "nldf_fft_serial.h"
 #include "config.h"
 #include "nldf_fft_core.h"
diff --git a/ciderpress/lib/pwutil/nldf_fft_serial.h b/ciderpress/lib/pwutil/nldf_fft_serial.h
index 0423013..1b11114 100644
--- a/ciderpress/lib/pwutil/nldf_fft_serial.h
+++ b/ciderpress/lib/pwutil/nldf_fft_serial.h
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #ifndef NLDF_FFT_SERIAL_H
 #define NLDF_FFT_SERIAL_H
 #include "nldf_fft_core.h"
diff --git a/ciderpress/lib/xc_utils/libxc_baselines.c b/ciderpress/lib/xc_utils/libxc_baselines.c
index dd7b533..c3cf810 100644
--- a/ciderpress/lib/xc_utils/libxc_baselines.c
+++ b/ciderpress/lib/xc_utils/libxc_baselines.c
@@ -1,3 +1,22 @@
+// CiderPress: Machine-learning based density functional theory calculations
+// Copyright (C) 2024 The President and Fellows of Harvard College
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+//
+// Author: Kyle Bystrom <kylebystrom@gmail.com>
+//
+
 #include <xc.h>
 
 void get_lda_baseline(int fn_id, int nspin, int size, double *rho, double *exc,
diff --git a/ciderpress/models/train.py b/ciderpress/models/train.py
index ee09cb0..05bb34c 100644
--- a/ciderpress/models/train.py
+++ b/ciderpress/models/train.py
@@ -115,7 +115,7 @@ def __init__(
 
     def map(self, mapping_plans):
         """
-        Map the MOLGP model to an Evaluator objec that can efficiently
+        Map the MOLGP model to an Evaluator object that can efficiently
         evaluate the XC energy and which can also be serialized.
 
         Returns:
@@ -672,7 +672,7 @@ def _compute_mol_covs(
 
     def map(self, mapping_plans):
         """
-        Map the MOLGP model to an Evaluator objec that can efficiently
+        Map the MOLGP model to an Evaluator object that can efficiently
         evaluate the XC energy and which can also be serialized.
 
         Returns:
diff --git a/ciderpress/pyscf/descriptors.py b/ciderpress/pyscf/descriptors.py
index 97e865c..a474cba 100644
--- a/ciderpress/pyscf/descriptors.py
+++ b/ciderpress/pyscf/descriptors.py
@@ -301,7 +301,10 @@ def get_full_rho(ni, mol, dms, grids, xctype):
     for ao, mask, weight, coords in ni.block_loop(
         mol, grids, nao, ao_deriv, max_memory
     ):
-        mask = None  # TODO remove once screening is fixed
+        # NOTE using the mask causes issues for some reason.
+        # We don't need the mask anyway because is not
+        # performance-critical code
+        mask = None
         ip1 = ip0 + weight.size
         for idm in range(nset):
             rho = make_rho(idm, ao, mask, xctype)
@@ -324,7 +327,10 @@ def get_full_rho_with_fl(ni, mol, dms, grids):
     for ao, mask, weight, coords in ni.block_loop(
         mol, grids, nao, ao_deriv, max_memory
     ):
-        mask = None  # TODO remove once screening is fixed
+        # NOTE using the mask causes issues for some reasons
+        # We don't need the mask anyway because is not
+        # performance-critical code
+        mask = None
         ip1 = ip0 + weight.size
         for idm in range(nset):
             rho = make_rho(idm, ao, mask, xctype)
@@ -461,7 +467,6 @@ def _sdmx_desc_getter(mol, pgrids, dm, settings, coeffs=None, **kwargs):
         maxmem //= 4
         maxmem += 1
     for ao, mask, weight, coords in ni.block_loop(mol, pgrids, nao, 0, maxmem):
-        # TODO implement screening
         ip1 = ip0 + weight.size
         if coeffs is None or len(coeffs) == 0:
             desc[:, ip0:ip1] = exxgen.get_features(dm, mol, coords)
diff --git a/ciderpress/pyscf/frac_lapl.py b/ciderpress/pyscf/frac_lapl.py
index 545d798..f4607d1 100644
--- a/ciderpress/pyscf/frac_lapl.py
+++ b/ciderpress/pyscf/frac_lapl.py
@@ -62,7 +62,6 @@
 
 class FracLaplBuf:
     def __init__(self, a, d, size, lmax, s):
-        print("INITIALIZING FLAPL", s)
         self.a = a
         self.d = d
         self.size = size
@@ -474,7 +473,6 @@ def _odp_dot_sparse_(
         # vmat = _dot_ao_ao_sparse(ao[0], aow1, None, nbins, mask, pair_mask,
         #                         ao_loc, hermi=0, out=vmat)
         aow2 = _scale_ao_sparse(kao, wv[5 : 5 + nk0], None, ao_loc, out=aow2)
-        print(np.sum(aow2), np.sum(aow1))
         aow1 += aow2
         vmat = _dot_ao_ao_sparse(
             ao[0], aow1, None, nbins, None, pair_mask, ao_loc, hermi=0, out=vmat
diff --git a/ciderpress/pyscf/gen_cider_grid.py b/ciderpress/pyscf/gen_cider_grid.py
index 9e24a45..01adc23 100644
--- a/ciderpress/pyscf/gen_cider_grid.py
+++ b/ciderpress/pyscf/gen_cider_grid.py
@@ -57,7 +57,6 @@ class CiderGrids(Grids):
 
     def __init__(self, mol, lmax=CIDER_DEFAULT_LMAX):
         super(CiderGrids, self).__init__(mol)
-        # self.becke_scheme = becke_lko
         self.lmax = lmax
         self.nlm = (lmax + 1) * (lmax + 1)
         self.grids_indexer = None
@@ -121,13 +120,9 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
             build_indexer=True,
             **kwargs
         )
-        # TODO cleaner version of this way of calling VXCgen_grid_lko
-        # tmp = libdft.VXCgen_grid
-        # libdft.VXCgen_grid = libcider.VXCgen_grid_lko
         self.coords, self.weights = self.get_partition(
             mol, atom_grids_tab, self.radii_adjust, self.atomic_radii, self.becke_scheme
         )
-        # libdft.VXCgen_grid = tmp
         self.grids_indexer.set_weights(self.weights)
 
         if sort_grids:
diff --git a/ciderpress/pyscf/numint.py b/ciderpress/pyscf/numint.py
index d648c98..d8f5321 100644
--- a/ciderpress/pyscf/numint.py
+++ b/ciderpress/pyscf/numint.py
@@ -945,7 +945,6 @@ def block_loop(
             coords = grids.coords[ip0:ip1]
             weight = grids.weights[ip0:ip1]
             mask = screen_index[ip0 // BLKSIZE :]
-            # TODO: pass grids.cutoff to eval_ao
             ao = self.eval_ao(
                 mol, coords, deriv=deriv, non0tab=mask, cutoff=grids.cutoff, out=buf
             )
@@ -1026,7 +1025,6 @@ def block_loop(
             coords = grids.coords[ip0:ip1]
             weight = grids.weights[ip0:ip1]
             mask = screen_index[ip0 // BLKSIZE :]
-            # TODO: pass grids.cutoff to eval_ao
             ao_buf = np.ndarray((comp + kcomp, weight.size * nao), buffer=buf)
             ao = ni.eval_ao(
                 mol, coords, deriv=deriv, non0tab=mask, cutoff=grids.cutoff, out=ao_buf
diff --git a/ciderpress/pyscf/pbc/dft.py b/ciderpress/pyscf/pbc/dft.py
index 4909085..fef0c20 100644
--- a/ciderpress/pyscf/pbc/dft.py
+++ b/ciderpress/pyscf/pbc/dft.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 from pyscf import lib
 
 from ciderpress.pyscf.dft import _CiderKS as _MolCiderKS
@@ -59,7 +79,6 @@ def set_mlxc(
         rhocut=None,
         nlc_coeff=None,
     ):
-        print(mlxc.settings, dir(mlxc.settings))
         if nldf_init is None and mlxc.settings.has_nldf:
             raise NotImplementedError
         if sdmx_init is None and mlxc.settings.has_sdmx:
diff --git a/ciderpress/pyscf/pbc/numint.py b/ciderpress/pyscf/pbc/numint.py
index 26a7981..54949ab 100644
--- a/ciderpress/pyscf/pbc/numint.py
+++ b/ciderpress/pyscf/pbc/numint.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import sys
 
 import numpy as np
diff --git a/ciderpress/pyscf/pbc/sdmx_fft.py b/ciderpress/pyscf/pbc/sdmx_fft.py
index 842e3f2..b0cee15 100644
--- a/ciderpress/pyscf/pbc/sdmx_fft.py
+++ b/ciderpress/pyscf/pbc/sdmx_fft.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import ctypes
 import time
 
@@ -101,24 +121,15 @@ def _gauss_diff_with_cutoff(a, r, k2):
     k = np.sqrt(k2)
     arg = (1j * k + 2 * a * r) / (2 * np.sqrt(a))
     arg2 = (1j * k - 2 * a * r) / (2 * np.sqrt(a))
-    print(np.isnan(arg).any())
+    if np.isnan(arg).any():
+        raise RuntimeError("nan values in internal CiderPress routine")
     tmp = erf(arg).real - erf(arg2).real
-    print(np.abs(tmp).sum())
     tmp = tmp * (k * np.sqrt(np.pi) * np.exp(arg * arg))
-    print("hi2", np.isnan(tmp).any())
-    g = (
-        2j
-        * np.sqrt(a)
-        * (np.exp(2j * k * r) - 1)
-        # + np.exp(arg * arg) * tmp
-    )
-    print("hi", np.isnan(g).any())
-    print(np.abs(g).sum(), np.abs(k).sum(), np.abs((np.exp(2j * k * r) - 1)).sum())
+    g = 2j * np.sqrt(a) * (np.exp(2j * k * r) - 1)
     g[:] *= np.exp(-r * (1j * k + a * r)) / (8 * a**1.5 * k)
     g[k2 < 1e-8] = np.sqrt(np.pi) / (4 * a**1.5) * erf(np.sqrt(a) * r) - np.exp(
         -a * r * r
     ) * r / (2 * a)
-    print(np.abs(g).sum())
     return g
 
 
@@ -197,17 +208,9 @@ def precompute_all_convolutions(cell, kpts, alphas, alpha_norms, itype, kpt_dim=
 
 
 def get_ao_and_aok(cell, coords, kpt, deriv):
-    t0 = time.monotonic()
     ao = eval_ao_kpts(cell, coords, kpt, deriv=deriv)[0]
-    t1 = time.monotonic()
     expir = np.exp(-1j * np.dot(coords, kpt))
     aokG = np.asarray(tools.fftk(np.asarray(ao.T, order="C"), cell.mesh, expir))
-    t2 = time.monotonic()
-    # expir = np.exp(1j*np.dot(coords, kpt))
-    # aokG = np.asarray(
-    #     tools.fftk(np.asarray(ao.conj().T, order='C'), cell.mesh, expir)
-    # )
-    print("ORB TIMES", t1 - t0, t2 - t1)
     return ao, aokG
 
 
@@ -297,7 +300,6 @@ def get_recip_convolutions(
                     kcell.lattice_vectors(), dtype=np.float64, order="C"
                 )
                 mesh = np.asarray(kcell.mesh, dtype=np.int32, order="C")
-                # print(mesh, maxqv, vq.dtype)
                 libcider.recip_conv_kernel_ws(
                     conv_kG.ctypes.data_as(ctypes.c_void_p),
                     vq.ctypes.data_as(ctypes.c_void_p),
@@ -308,7 +310,6 @@ def get_recip_convolutions(
                     ctypes.c_int(kG.shape[0]),
                     ctypes.c_int(vq.size),
                 )
-                # print(np.where(np.isnan(conv_kG)), np.sum(np.isnan(conv_kG)))
             else:
                 raise ValueError
         else:
@@ -442,13 +443,10 @@ def compute_sdmx_tensor(
         t1s = time.monotonic()
         # ao, aokG = get_ao_and_aok(cell, coords, kpt, 0)
         ao, aokG = get_ao_and_aok_fast(cell, coords, kpt, 0)
-        ta = time.monotonic()
         ao = ao * np.exp(-1j * np.dot(coords, kpt))[:, None]
         # After above op, `ao` var contains conj(ao) e^ikr, which is periodic
         # because conj(ao) = (periodic part) * e^-ikr
-        tb = time.monotonic()
         c0 = _dot_ao_dm(cell, ao, dms[kn], None, shls_slice, ao_loc)
-        tc = time.monotonic()
         if has_l1:
             c1 = fft_grad_fast(c0.T, cell.mesh, Gv).transpose(0, 2, 1)
             c1[0] += 1j * kpt[0] * c0
@@ -456,7 +454,6 @@ def compute_sdmx_tensor(
             c1[2] += 1j * kpt[2] * c0
         Gk = Gv + kpt
         Gk2 = lib.einsum("gv,gv->g", Gk, Gk)
-        td = time.monotonic()
         t1e = time.monotonic()
         t2s = time.monotonic()
         for ialpha in range(nalpha):
@@ -479,7 +476,6 @@ def compute_sdmx_tensor(
         t2e = time.monotonic()
         t1 += t1e - t1s
         t2 += t2e - t2s
-        print("LOOP TIMES", t1s - ta, tb - ta, tc - tb, td - tc)
     t3s = time.monotonic()
     tmp[:] /= len(kpts)
     if has_l1:
@@ -487,17 +483,13 @@ def compute_sdmx_tensor(
         tmp[1] += tmp1[0]
         tmp[2] += tmp1[1]
         tmp[3] += tmp1[2]
-    # if not has_l1:
-    #     tmp = tmp[0]
     t3e = time.monotonic()
     print("SDMX TIMES", t0e - t0s, t1, t2, t3e - t3s)
     return tmp
 
 
 def _get_grid_info(cell, kpts, dms):
-    # if False:  # len(kpts) == 1 and np.linalg.norm(kpts) < 1e-16 and dms[0].dtype == np.float64:
     if len(kpts) == 1 and np.linalg.norm(kpts) < 1e-16 and dms[0].dtype == np.float64:
-        print("GAMMA ONLY ROUTINE")
         rtype = np.float64
         nz = cell.mesh[2] // 2 + 1
         assert cell.Gv.ndim == 2
@@ -507,7 +499,6 @@ def _get_grid_info(cell, kpts, dms):
                 ..., :nz, :
             ].reshape(-1, 3)
         )
-        # Gv[:, 2] = np.abs(Gv[:, 2])
         ng_recip = Gv.shape[0]
         ng_real = ng_recip * 2
         mesh = cell.mesh
@@ -516,11 +507,9 @@ def _get_grid_info(cell, kpts, dms):
         # rz = np.fft.fftfreq(mesh[2], 1./mesh[2])
         rz = np.arange(nz)
         b = cell.reciprocal_vectors()
-        abs(np.linalg.det(b))
         Gvbase = (rx, ry, rz)
         Gv = np.dot(lib.cartesian_prod(Gvbase), b)
     else:
-        print("MULTI-KPT ROUTINE")
         rtype = np.complex128
         Gv = cell.Gv
         ng_recip = Gv.shape[0]
@@ -549,20 +538,6 @@ def _zero_even_edges_fft(x, mesh):
     )
 
 
-def print_unfolded_sum(x, mesh):
-    if x.shape[-1] == np.prod(mesh):
-        print(np.sum(np.abs(x)))
-    elif x.shape[-1] == mesh[0] * mesh[1] * (mesh[2] // 2 + 1):
-        x = x.copy()
-        shape = x.shape
-        x.shape = (-1, x.shape[-1])
-        _weight_symm_gpts(x, mesh)
-        x.shape = shape
-        print(2 * np.sum(np.abs(x)))
-    else:
-        print(np.sum(np.abs(x)))
-
-
 def _get_igk(Gk, rtype, mesh, zero_even_edge=None):
     # default zero_even_edge is True for rtype=np.float64,
     # False for rtype=np.complex128
@@ -840,26 +815,18 @@ def compute_sdmx_tensor_mo(
             nmo = cmo_lists[s][kn].shape[0]
             ck_all = np.ndarray((nmo, ng_recip), dtype=np.complex128, buffer=ck_all_buf)
             lib.dot(cmo_lists[s][kn], aoG, c=ck_all)
-            if dms is not None:
-                dm = cmo_lists[s][kn]
-                dmtmp = dm.conj().T.dot(dm)
-                print("DMDIFF", np.linalg.norm(dms[kn] - dmtmp))
             for i0, i1 in lib.prange(0, nmo, nmo_per_block):
-                time.monotonic()
                 ck = np.ndarray(
                     (ncpa, i1 - i0, ng_recip), dtype=np.complex128, buffer=bufs[:ncpa]
                 )
                 cr = np.ndarray(
                     (ncpa, i1 - i0, ng_real), dtype=rtype, buffer=bufs[:ncpa]
                 )
-                time.monotonic()
                 ck[0] = ck_all[i0:i1]
-                time.monotonic()
                 if has_l1:
                     _mul_z(ck[0], igk[:, 0], ck[1])
                     _mul_z(ck[0], igk[:, 1], ck[2])
                     _mul_z(ck[0], igk[:, 2], ck[3])
-                time.monotonic()
                 shape = ck.shape
                 ck.shape = (ncpa * (i1 - i0), ng_recip)
                 fft_fast(ck, cell.mesh, fwd=False, inplace=True)
@@ -870,7 +837,6 @@ def compute_sdmx_tensor_mo(
                 conv_aor = np.ndarray(
                     (i1 - i0, ng_real), dtype=rtype, buffer=bufs[ncpa]
                 )
-                time.monotonic()
                 tc, td = 0, 0
                 for ialpha in range(nalpha):
                     # conv_ao[:] = ck * conv_aG[ialpha]
@@ -1011,7 +977,6 @@ def compute_sdmx_tensor_vxc(
     spinpol=False,
 ):
     # NOTE: vgrid should have weight applied already
-    print("STARTING VXC")
     if spinpol:
         nspin = len(vmats)
         vmat_list = vmats
@@ -1249,7 +1214,6 @@ def from_settings_and_mol(
             max_exp = np.max(cell._env[cell._bas[:, PTR_EXP]])
             nalpha = 1 + int(1 + np.log(max_exp / alpha0) / np.log(lambd))
             nalpha += 2  # buffer exponents
-        print("NALPHA", nalpha)
         if isinstance(settings, SDMXFullSettings):
             plan = SDMXFullPlan(settings, nspin, alpha0, lambd, nalpha)
         elif isinstance(settings, SDMXSettings):
@@ -1282,7 +1246,6 @@ def get_dm_convolutions(
         old_kdim = self._kpt_dim
         self.set_kpt_dim(tools.pbc.get_monkhorst_pack_size(cell, kpts))
         if old_kdim is None or not (old_kdim == self._kpt_dim).all():
-            print("PRECOMPUTING CONVOLUTIONS")
             self._precomp_list = precompute_all_convolutions(
                 cell,
                 kpts,
@@ -1329,7 +1292,6 @@ def get_dm_convolutions(
         if kpts is None:
             kpts = np.zeros((1, 3))
         if has_mo:
-            print("MO FEAT GEN")
             tmps = compute_sdmx_tensor_mo(
                 mo_coeff,
                 mo_occ,
@@ -1346,7 +1308,6 @@ def get_dm_convolutions(
                 spinpol=spinpol,
             )
         else:
-            print("DM FEAT GEN")
             tmps = compute_sdmx_tensor_lowmem(
                 dms,
                 cell,
diff --git a/ciderpress/pyscf/pbc/tests/test_fft.py b/ciderpress/pyscf/pbc/tests/test_fft.py
index 6b6a726..122f7b7 100644
--- a/ciderpress/pyscf/pbc/tests/test_fft.py
+++ b/ciderpress/pyscf/pbc/tests/test_fft.py
@@ -1,7 +1,28 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import ctypes
 
 import numpy as np
 from numpy.fft import fftn, ifftn, irfftn, rfftn
+from numpy.testing import assert_allclose
 
 from ciderpress.pyscf.pbc.sdmx_fft import libcider
 
@@ -52,78 +73,34 @@ def main():
         xkin.imag = xkin2
         xkin[:] = rfftn(xrin, norm="backward")
         xkc = fftn(xrin.astype(np.complex128), norm="backward")
-        # xkin.imag[..., 0] = 0
-        # if mesh[-1] % 2 == 0:
-        #     xkin.imag[..., -1] = 0
         xkin2 = xkin.copy()
-        if False:  # (np.array(mesh) % 2 == 0).all():
-            # xkin2[0, 0, -1] += 1j
-            # xkin2[0, 0, 0] += 1j
-            # xkin2[mesh[0]//2, mesh[1]//2, -1] += 50j
-            cx = mesh[0] // 2
-            cy = mesh[1] // 2
-            const = 5j
-            xkin2[cx, cy, 0] = const
-            xkin2[-cx, -cy, 0] = np.conj(const)
-            # print(mesh[0]//3, mesh[0])
-            xkc[cx, cy, 0] = const
-            xkc[-cx, -cy, 0] = np.conj(const)
-            # xkin2[mesh[0]//2, mesh[1]//2, 0] += 1j
-        if True:
+        if mesh[2] % 2 == 0:
             xkin2[0, 0, 0] = xkin2.real[0, 0, 0]
             xkin2[0, 0, -1] = xkin2.real[0, 0, -1]
             for ind in [0, -1]:
-                # for i in range(1, (xkin2.shape[-2] + 1) // 2 ):
                 for i in range(xkin2.shape[-3]):
                     for j in range(xkin2.shape[-2]):
-                        # print(i, j, xkin2.shape[-2] - i, xkin2.shape[-1] - j)
                         tmp1 = xkin2[i, j, ind]
                         tmp2 = xkin2[-i, -j, ind]
                         xkin2[i, j, ind] = 0.5 * (tmp1 + tmp2.conj())
                         xkin2[-i, -j, ind] = 0.5 * (tmp1.conj() + tmp2)
-                # xkin_tmp = xkin2[1:, 1:, ind][::-1, ::-1].conj().copy()
-                # xkin2[1:, 1:, ind] += xkin_tmp
-                # xkin2[1:, 1:, ind] *= 0.5
-        # xkc = np.zeros(mesh, dtype=np.complex128)
-        # xkc[..., :kmesh[2]] = xkin2
-        # xkc[..., 0] = xkin[..., 0]
-        # for i in range(1, (xkc.shape[-1] + 1) // 2):
-        #     print(i, xkc.shape[-1] - i)
-        #     xkc[..., i] = xkin[..., -i].conj()
-        # if mesh[2] % 2 == 1:
-        #     xkc[..., kmesh[2]:] = xkin2[..., -1:0:-1].conj()
-        # else:
-        #     xkc[..., kmesh[2]:] = xkin2[..., -2:0:-1].conj()
-        if False:
-            xkc[0, 0, 0] = xkc.real[0, 0, 0]
-            xkc[0, 0, -1] = xkc.real[0, 0, -1]
-            for ind in [0, -1]:
-                xkin_tmp = xkc[1:, 1:, ind][::-1, ::-1].conj().copy()
-                xkin2[1:, 1:, ind] += xkin_tmp
-                xkin2[1:, 1:, ind] *= 0.5
 
         xk_np = rfftn(xrin)
         xk_mkl = _call_mkl_test(xrin, True, mesh)
-        print(xk_np.shape, kmesh)
         assert (xk_np.shape == np.array(kmesh)).all()
         assert (xk_mkl.shape == np.array(kmesh)).all()
-        print(np.linalg.norm(xk_np - xk_mkl))
 
         xr2_np = ifftn(xkc.copy(), s=mesh, norm="forward")
         xr_np = irfftn(xkin.copy(), s=mesh, norm="forward")
         xr3_np = irfftn(xkin2.copy(), s=mesh, norm="forward")
         xr_mkl = _call_mkl_test(xkin, False, mesh)
         xr2_mkl = _call_mkl_test(xkin2, False, mesh)
-        print(xr_np.shape, mesh)
         assert (xr_np.shape == np.array(mesh)).all()
         assert (xr_mkl.shape == np.array(mesh)).all()
-        print(np.linalg.norm(xr_mkl), np.linalg.norm(xr_np - xr_mkl))
-        print(np.linalg.norm(xr2_mkl), np.linalg.norm(xr2_mkl - xr_mkl))
-        print(np.linalg.norm(xr3_np), np.linalg.norm(xr2_mkl - xr_mkl))
-        print("IMPORTANT", np.linalg.norm(xr2_np.imag), np.linalg.norm(xr2_np - xr3_np))
-        if np.prod(mesh) < 100:
-            print(xr_np)
-            print(xr_mkl)
+        assert_allclose(xr2_np.imag, 0, atol=1e-9)
+        assert_allclose(xr2_np, xr3_np, atol=1e-9)
+        assert_allclose(xr2_mkl, xr3_np, atol=1e-9)
+        assert_allclose(xr_mkl, xr_np, atol=1e-9)
 
 
 if __name__ == "__main__":
diff --git a/ciderpress/pyscf/pbc/tests/test_sdmx_fft.py b/ciderpress/pyscf/pbc/tests/test_sdmx_fft.py
index f5a105a..144e187 100644
--- a/ciderpress/pyscf/pbc/tests/test_sdmx_fft.py
+++ b/ciderpress/pyscf/pbc/tests/test_sdmx_fft.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import ctypes
 import time
 import unittest
@@ -50,9 +70,6 @@ def test_ao(self):
         pbcks = pbcdft.RKS(cell)
         pbcks.build()
         grids = pbcks.grids
-        print(cell.mesh)
-        print(pbcks.grids.coords.shape)
-        print(type(pbcks.grids))
 
         cond = np.linalg.norm(grids.coords - 0.5 * L * np.ones(3), axis=1) < 1.5
         ao, aok = get_ao_and_aok(cell, grids.coords, np.zeros(3), 0)
@@ -75,7 +92,6 @@ def test_ao(self):
             cell, np.zeros((1, 3)), alphas, alpha_norms, "gauss_diff"
         )
         for ialpha in range(alphas.size):
-            print(ws_convs[ialpha], alphas[ialpha])
             cao_test = convolve_aos(
                 cell,
                 aok,
@@ -119,7 +135,6 @@ def test_ao(self):
         for v in range(ncpa):
             for ialpha in range(nalpha):
                 tmp[v, ialpha] = _contract_rho(cao[ialpha * ncpa + v], c0)
-        print(p_ag.shape, tmp.shape, cond.shape)
         assert_allclose(p_ag[0][..., cond], tmp[0][..., cond], atol=1e-3, rtol=1e-4)
 
         p_ag = compute_sdmx_tensor_lowmem(
@@ -150,7 +165,6 @@ def test_ao(self):
         for v in range(ncpa):
             for ialpha in range(nalpha):
                 tmp[v, ialpha] = _contract_rho(cao[ialpha * ncpa + v], c0)
-        print(p_ag.shape, tmp.shape, cond.shape)
         assert_allclose(p_ag[0, :, cond], tmp[0, :, cond], atol=1e-3, rtol=1e-4)
         assert_allclose(p_ag[1, :, cond], tmp[1, :, cond], atol=1e-3, rtol=1e-4)
         assert_allclose(p_ag[2, :, cond], tmp[2, :, cond], atol=1e-3, rtol=1e-4)
@@ -168,7 +182,6 @@ def test_kpts(self):
         struct * [2, 2, 2]
         struct211 = struct * [2, 1, 1]
         struct.cell
-        print(struct)
         unitcell = pbcgto.Cell()
         unitcell.a = struct.cell
         unitcell.atom = pyscf_ase.ase_atoms_to_pyscf(struct)
@@ -187,7 +200,6 @@ def test_kpts(self):
         ks = pbcdft.KRKS(unitcell, kpts)
         ks.xc = "PBE"
         ks.kernel()
-        print(kpts)
 
         kpt = kpts.kpts[3]
         recip_cell = build_ft_cell(unitcell)
@@ -198,17 +210,12 @@ def test_kpts(self):
         aor_ref, aok_ref = get_ao_and_aok(unitcell, ks.grids.coords, kpt, deriv=0)
         norm = 1.0  # 4 * np.pi * np.prod(unitcell.mesh) / np.abs(np.linalg.det(unitcell.lattice_vectors()))
         # assert_allclose(norm * np.abs(aok_test[0]), np.abs(aok_ref.T), atol=1e-6, rtol=1e-6)
-        for i, j in zip(
-            (norm * aok_test[0, 0]).round(3).tolist(), aok_ref[:, 0].round(3).tolist()
-        ):
-            print(i, j)
         assert_allclose(norm * aok_test[0], aok_ref.T, atol=1e-6, rtol=1e-6)
         assert_allclose(aok_test[0], aok_ref.T, atol=1e-6, rtol=1e-6)
 
         assert aok_test[0].flags.f_contiguous
         t0 = time.monotonic()
         for mklpar in [0, 1]:
-            print(mklpar)
             _aok_test = aok_test[0].T.copy()
             assert _aok_test.flags.c_contiguous
             assert _aok_test.T.shape == aok_test[0].shape
@@ -244,7 +251,6 @@ def test_kpts(self):
         )
         for ind in [3, 4, 7]:
             for has_l1 in [True, False]:
-                print(ind, has_l1)
                 dm1p = [dm.copy() for dm in dm1]
                 delta = 1e-4
                 delta2 = 0.1
@@ -260,9 +266,7 @@ def test_kpts(self):
                     cutoff_type="ws",
                     has_l1=has_l1,
                 )
-                mysum = np.sqrt(p1_ag * p1_ag + delta2).sum()
                 dm1p[1][ind, ind] -= delta
-                t0 = time.monotonic()
                 p1_ag = compute_sdmx_tensor_lowmem(
                     dm1p,
                     unitcell,
@@ -274,10 +278,6 @@ def test_kpts(self):
                     cutoff_type="ws",
                     has_l1=has_l1,
                 )
-                t1 = time.monotonic()
-                print("FWD TIME", t1 - t0)
-                mysum2 = np.sqrt(p1_ag * p1_ag + delta2).sum()
-                tot = (mysum - mysum2) / delta
                 mo_coeff = kpts.transform_mo_coeff(ks.mo_coeff)
                 mo_occ = kpts.transform_mo_occ(ks.mo_occ)
                 p1mo_ag = compute_sdmx_tensor_mo(
@@ -294,7 +294,6 @@ def test_kpts(self):
                 )
                 vgrid = p1mo_ag / np.sqrt(delta2 + p1mo_ag * p1mo_ag)
                 vmats = [np.zeros_like(dm) for dm in dm1]
-                t0 = time.monotonic()
                 compute_sdmx_tensor_vxc(
                     vmats,
                     unitcell,
@@ -306,9 +305,6 @@ def test_kpts(self):
                     cutoff_type="ws",
                     has_l1=has_l1,
                 )
-                t1 = time.monotonic()
-                print("VXC time", t1 - t0)
-                print(mysum, tot, vmats[1][ind, ind])
 
         supercell = pbcgto.Cell()
         supercell.a = struct211.cell
@@ -337,7 +333,6 @@ def test_kpts(self):
 
         alphas = [1.0]
         alpha_norms = [1.0]
-        print("SHAPES", ks.grids.coords.shape, unitcell.mesh, type(ks.grids))
         p1_ag = compute_sdmx_tensor_lowmem(
             dm1,
             unitcell,
@@ -362,10 +357,6 @@ def test_kpts(self):
         )
         ngrids = ks.grids.coords.shape[0]
         tol = 1e-3
-        dv = np.abs(np.linalg.det(unitcell.lattice_vectors())) / ngrids
-        print(
-            p1_ag.sum() * dv, p2_ag[:, :ngrids].sum() * dv, p2_ag[:, ngrids:].sum() * dv
-        )
         assert_allclose(p1_ag, p2_ag[:, :, :ngrids], atol=tol, rtol=tol)
         assert_allclose(p1_ag, p2_ag[:, :, ngrids:], atol=tol, rtol=tol)
 
@@ -425,14 +416,7 @@ def test_kpts(self):
         t1 = time.monotonic()
         print("VXC time", t1 - t0)
         tol = 1e-3
-        dv = np.abs(np.linalg.det(unitcell.lattice_vectors())) / ngrids
         for v in range(4):
-            print(v)
-            print(
-                np.abs(p1_ag[v]).sum() * dv,
-                np.abs(p2_ag[v, :, :ngrids]).sum() * dv,
-                np.abs(p2_ag[v, :, ngrids:]).sum() * dv,
-            )
             assert_allclose(p1_ag[v], p2_ag[v, :, :ngrids], atol=tol, rtol=tol)
             assert_allclose(p1_ag[v], p2_ag[v, :, ngrids:], atol=tol, rtol=tol)
             assert_allclose(p2mo_ag[v], p2_ag[v], atol=tol, rtol=tol)
@@ -446,16 +430,8 @@ def test_full_dft_run(self):
         pseudo = "gth-pade"
         from ase.build import bulk
 
-        # struct = bulk('C', cubic=True)
-        # kpt_grid = [2, 2, 2]
-        # struct = bulk('C', cubic=False)
-        # kpt_grid = [4, 4, 4]
         struct = bulk("MgO", crystalstructure="rocksalt", a=4.19)
-        # kpt_grid = [4, 4, 4]
         kpt_grid = [2, 2, 2]
-        # struct = bulk('MgO', crystalstructure='rocksalt', a=4.19, cubic=True)
-        # kpt_grid = [2, 2, 2]
-        print(struct.cell.volume)
         unitcell = pbcgto.Cell()
         unitcell.a = struct.cell
         unitcell.atom = pyscf_ase.ase_atoms_to_pyscf(struct)
@@ -475,18 +451,11 @@ def test_full_dft_run(self):
         ks.xc = "r2SCAN"
         mlfunc = "functionals/CIDER24Xe.yaml"
 
-        t0 = time.monotonic()
-        # ks.kernel()
-        t1 = time.monotonic()
-        # ks = None
-        t2 = time.monotonic()
         cider_ks = dft.make_cider_calc(
             ks, mlfunc, xmix=0.25, xkernel="GGA_X_PBE", ckernel="GGA_C_PBE"
         )
         ks = None
         cider_ks.kernel()
-        t3 = time.monotonic()
-        print(t1 - t0, t3 - t2)
 
     @unittest.skip("high-memory")
     def test_dense_xc(self):
diff --git a/ciderpress/pyscf/pbc/tests/test_util.py b/ciderpress/pyscf/pbc/tests/test_util.py
index 442ead5..9c288e1 100644
--- a/ciderpress/pyscf/pbc/tests/test_util.py
+++ b/ciderpress/pyscf/pbc/tests/test_util.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import numpy as np
diff --git a/ciderpress/pyscf/pbc/util.py b/ciderpress/pyscf/pbc/util.py
index 91045ff..4845297 100644
--- a/ciderpress/pyscf/pbc/util.py
+++ b/ciderpress/pyscf/pbc/util.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import ctypes
 
 import numpy as np
diff --git a/ciderpress/pyscf/rks_grad.py b/ciderpress/pyscf/rks_grad.py
index 4029fa5..f24dab3 100644
--- a/ciderpress/pyscf/rks_grad.py
+++ b/ciderpress/pyscf/rks_grad.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import numpy as np
 import pyscf.df.grad.rks as rks_grad_df
 from pyscf import lib
diff --git a/ciderpress/pyscf/sdmx.py b/ciderpress/pyscf/sdmx.py
index c3552c5..e0f6974 100644
--- a/ciderpress/pyscf/sdmx.py
+++ b/ciderpress/pyscf/sdmx.py
@@ -19,13 +19,12 @@
 #
 
 import ctypes
-import time
 
 import numpy as np
 from pyscf import lib
 from pyscf.dft.numint import _dot_ao_ao, _dot_ao_dm, _scale_ao, eval_ao
 from pyscf.gto.eval_gto import BLKSIZE, _get_intor_and_comp, make_screen_index
-from pyscf.gto.mole import ANG_OF, ATOM_OF, PTR_EXP
+from pyscf.gto.mole import ANG_OF, ATOM_OF, NCTR_OF, PTR_EXP
 
 from ciderpress.dft.plans import SADMPlan, SDMXFullPlan, SDMXIntPlan, SDMXPlan
 from ciderpress.dft.settings import SADMSettings, SDMXFullSettings, SDMXSettings
@@ -43,6 +42,27 @@ def _get_ylm_atom_loc(mol):
     return ylm_atom_loc
 
 
+def _get_nrf(mol):
+    """
+    Get the number of unique radial functions associated with a basis
+    """
+    return np.sum(mol._bas[:, NCTR_OF])
+
+
+def _get_rf_loc(mol):
+    """
+    Get an integer numpy array with that gives the location of the
+    unique radial functions corresponding to each shell.
+    So rf_loc[shl : shl + 1] is the range of radial functions contained
+    in the shell. This is similar to ao_loc, except that it does not
+    include spherical harmonics, so each shell has a factor of 2l+1
+    more functions in ao_loc than rf_loc.
+    """
+    rf_loc = mol._bas[:, NCTR_OF]
+    rf_loc = np.append([0], np.cumsum(rf_loc)).astype(np.int32)
+    return rf_loc
+
+
 def eval_conv_shells(
     feval,
     plan,
@@ -98,6 +118,9 @@ def eval_conv_shells(
     nbas = bas.shape[0]
     ngrids = coords.shape[0]
 
+    rf_loc = _get_rf_loc(mol)
+    nrf = rf_loc[-1]
+
     if shls_slice is None:
         shls_slice = (0, nbas)
     if "spinor" in eval_name:
@@ -105,14 +128,13 @@ def eval_conv_shells(
         # ao = np.ndarray((2, comp, nao, ngrids), dtype=np.complex128,
         #                 buffer=out).transpose(0, 1, 3, 2)
     else:
-        ao = np.ndarray((comp, nbas, ngrids), buffer=out).transpose(0, 2, 1)
+        ao = np.ndarray((comp, nrf, ngrids), buffer=out).transpose(0, 2, 1)
 
     if non0tab is None:
         if cutoff is None:
             non0tab = np.ones(((ngrids + BLKSIZE - 1) // BLKSIZE, nbas), dtype=np.uint8)
         else:
             non0tab = make_screen_index(mol, coords, shls_slice, cutoff)
-    time.monotonic()
     drv(
         eval_fn,
         contract_fn,
@@ -120,6 +142,7 @@ def eval_conv_shells(
         ctypes.c_int(ngrids),
         (ctypes.c_int * 2)(1, ncpa),
         (ctypes.c_int * 2)(*shls_slice),
+        rf_loc.ctypes.data_as(ctypes.c_void_p),
         ao.ctypes.data_as(ctypes.c_void_p),
         coords.ctypes.data_as(ctypes.c_void_p),
         non0tab.ctypes.data_as(ctypes.c_void_p),
@@ -132,8 +155,6 @@ def eval_conv_shells(
         alpha_norms.ctypes.data_as(ctypes.c_void_p),
         ctypes.c_int(alphas.size),
     )
-    time.monotonic()
-    # print("Get shells", t1 - t0)
     if comp == 1:
         if "spinor" in eval_name:
             ao = ao[:, 0]
@@ -248,8 +269,8 @@ def deriv(self):
 
     def get_extra_ao(self, mol):
         ymem = _get_ylm_atom_loc(mol)[-1]
-        cmem = (1 + 6 * self.deriv) * mol.nbas
-        bmem = (1 + self.deriv) * mol.nbas * self.plan.nalpha
+        cmem = (1 + 6 * self.deriv) * _get_nrf(mol)
+        bmem = (1 + self.deriv) * _get_nrf(mol) * self.plan.nalpha
         return ymem + cmem + bmem
 
     def get_ao(self, mol, coords, non0tab=None, cutoff=None, save_buf=True):
@@ -257,7 +278,7 @@ def get_ao(self, mol, coords, non0tab=None, cutoff=None, save_buf=True):
         return eval_ao(mol, coords, deriv=0, non0tab=non0tab, cutoff=cutoff, out=None)
 
     def get_cao(self, mol, coords, non0tab=None, cutoff=None, save_buf=True):
-        caobuf_size = mol.nbas * coords.shape[0] * self.plan.nalpha
+        caobuf_size = _get_nrf(mol) * coords.shape[0] * self.plan.nalpha
         if self.deriv > 0:
             caobuf_size *= 2
         if self._cao_buf is not None and self._cao_buf.size >= caobuf_size:
@@ -290,6 +311,7 @@ def _contract_ao_to_bas_single_(
         atom_coords=None,
         bwd=False,
     ):
+        rf_loc = _get_rf_loc(mol)
         args = [
             ctypes.c_int(b0.shape[-1]),
             b0.ctypes.data_as(ctypes.c_void_p),
@@ -303,6 +325,8 @@ def _contract_ao_to_bas_single_(
             mol._bas.ctypes.data_as(ctypes.c_void_p),
             ctypes.c_int(mol.nbas),
             mol._env.ctypes.data_as(ctypes.c_void_p),
+            ctypes.c_int(rf_loc[-1]),
+            rf_loc.ctypes.data_as(ctypes.c_void_p),
         ]
         if coords is not None:
             assert coords.ndim == 1 and coords.flags.c_contiguous
@@ -372,6 +396,7 @@ def _contract_ao_to_bas_helper(
         ylm_atom_loc = _get_ylm_atom_loc(mol)
         if self.deriv == 1:
             atom_coords = np.asfortranarray(mol.atom_coords(unit="Bohr"))
+            rf_loc = _get_rf_loc(mol)
             args = [
                 ctypes.c_int(b0.shape[-1]),
                 b0.ctypes.data_as(ctypes.c_void_p),
@@ -387,6 +412,8 @@ def _contract_ao_to_bas_helper(
                 mol._env.ctypes.data_as(ctypes.c_void_p),
                 coords.ctypes.data_as(ctypes.c_void_p),
                 atom_coords.ctypes.data_as(ctypes.c_void_p),
+                ctypes.c_int(rf_loc[-1]),
+                rf_loc.ctypes.data_as(ctypes.c_void_p),
             ]
             if bwd:
                 fn = libcider.SDMXcontract_ao_to_bas_l1_bwd
@@ -405,41 +432,10 @@ def _contract_ao_to_bas_helper(
                 coords=None,
                 bwd=bwd,
             )
-        """
-        if self.deriv > 0:
-            atom_coords = np.asfortranarray(mol.atom_coords(unit='Bohr'))
-            self._contract_ao_to_bas_single_(
-                mol, b0[1], ylm[1], c0, shls_slice, ao_loc,
-                ylm_atom_loc, coords=None, bwd=bwd
-            )
-            self._contract_ao_to_bas_single_(
-                mol, b0[2], ylm[2], c0, shls_slice, ao_loc,
-                ylm_atom_loc, coords=None, bwd=bwd
-            )
-            self._contract_ao_to_bas_single_(
-                mol, b0[3], ylm[3], c0, shls_slice, ao_loc,
-                ylm_atom_loc, coords=None, bwd=bwd
-            )
-            self._contract_ao_to_bas_single_(
-                mol, b0[4], ylm[0], c0, shls_slice, ao_loc,
-                ylm_atom_loc, coords=coords[:, 0],
-                atom_coords=atom_coords[:, 0], bwd=bwd
-            )
-            self._contract_ao_to_bas_single_(
-                mol, b0[5], ylm[0], c0, shls_slice, ao_loc,
-                ylm_atom_loc, coords=coords[:, 1],
-                atom_coords=atom_coords[:, 1], bwd=bwd
-            )
-            self._contract_ao_to_bas_single_(
-                mol, b0[6], ylm[0], c0, shls_slice, ao_loc,
-                ylm_atom_loc, coords=coords[:, 2],
-                atom_coords=atom_coords[:, 2], bwd=bwd
-            )
-        """
 
     def _contract_ao_to_bas(self, mol, c0, shls_slice, ao_loc, coords, ylm=None):
         ngrids = coords.shape[0]
-        b0 = np.empty((1 + 6 * self.deriv, mol.nbas, ngrids))
+        b0 = np.empty((1 + 6 * self.deriv, _get_nrf(mol), ngrids))
         self._contract_ao_to_bas_helper(
             mol,
             b0,
@@ -472,7 +468,7 @@ def _eval_crho_potential(
     ):
         ngrids = coords.shape[0]
         nalpha = self.plan.nalpha
-        b0 = np.empty((1 + 6 * self.deriv, mol.nbas, ngrids)).transpose(0, 2, 1)
+        b0 = np.empty((1 + 6 * self.deriv, _get_nrf(mol), ngrids)).transpose(0, 2, 1)
         if self.deriv == 0:
             b0[0] = _scale_ao(cao[:nalpha], tmp2[0])
         else:
@@ -532,7 +528,6 @@ def get_features(
             l0tmp2 = np.empty((n_out, n0, nalpha, ngrids))
             l1tmp2 = np.empty((n_out, nfeat - n0, 3, nalpha, ngrids))
         out = np.empty((n_out, nfeat, ngrids))
-        time.monotonic()
         if ao is None:
             ao = self.get_ao(
                 mol, coords, non0tab=non0tab, cutoff=cutoff, save_buf=save_buf
@@ -541,15 +536,12 @@ def get_features(
             cao = self.get_cao(
                 mol, coords, non0tab=non0tab, cutoff=cutoff, save_buf=save_buf
             )
-        time.monotonic()
         shls_slice = (0, mol.nbas)
         ao_loc = mol.ao_loc_nr()
         ylm = self._get_ylm(mol, coords, savebuf=True)
         for idm, dm in enumerate(dms):
             c0 = _dot_ao_dm(mol, ao, dm, None, shls_slice, ao_loc)
-            time.monotonic()
             b0 = self._contract_ao_to_bas(mol, c0, shls_slice, ao_loc, coords, ylm=ylm)
-            time.monotonic()
             if has_l1:
                 assert tmp.flags.c_contiguous
                 assert b0.flags.c_contiguous
@@ -566,14 +558,9 @@ def get_features(
                     tmp[0, ialpha] = lib.einsum(
                         "bg,bg->g", b0[0], cao[0 * nalpha + ialpha].T
                     )
-            time.monotonic()
             self.plan.get_features(
                 tmp, out=out[idm], l0tmp=l0tmp2[idm], l1tmp=l1tmp2[idm]
             )
-            time.monotonic()
-        # print('times', t1 - t0, t2 - t1, t3 - t2, torb2 - torb)
-        # print(out.mean(axis=-1))
-        # exit()
         if ndim == 2:
             out = out[0]
         self._cached_ao_data = (mol, ao, cao, l0tmp2, l1tmp2, coords, ylm)
diff --git a/ciderpress/pyscf/sdmx_slow.py b/ciderpress/pyscf/sdmx_slow.py
index fe8a6b4..e79208a 100644
--- a/ciderpress/pyscf/sdmx_slow.py
+++ b/ciderpress/pyscf/sdmx_slow.py
@@ -19,7 +19,6 @@
 #
 
 import ctypes
-import time
 
 import numpy as np
 from pyscf.dft.numint import _contract_rho, _dot_ao_ao, _dot_ao_dm, _scale_ao, eval_ao
@@ -124,13 +123,11 @@ def eval_conv_gto(
 
     # normal call tree is GTOval_sph_deriv0 -> GTOval_sph -> GTOeval_sph_drv
     drv = getattr(libcgto, "GTOeval_sph_drv")
-    tt = 0
     for i, alpha in enumerate(alphas):
         libcider.set_global_convolution_exponent(
             ctypes.c_double(alpha),
             ctypes.c_double(alpha_norms[i]),
         )
-        t0 = time.monotonic()
         drv(
             eval_fn,
             contract_fn,
@@ -148,13 +145,11 @@ def eval_conv_gto(
             ctypes.c_int(nbas),
             env.ctypes.data_as(ctypes.c_void_p),
         )
-        tt += time.monotonic() - t0
     if comp == 1:
         if "spinor" in eval_name:
             ao = ao[:, 0]
         else:
             ao = ao[0]
-    print("orb time", tt)
     return ao
 
 
@@ -232,7 +227,6 @@ def eval_conv_gto_fast(
             gaunt_lmax = max(lmax - 1, gaunt_lmax)
     ylm = np.ndarray((cpa, ylm_atom_loc[-1], ngrids), buffer=ylm_buf)
     atom_coords = np.ascontiguousarray(mol.atom_coords(unit="Bohr"))
-    t0 = time.monotonic()
     libcider.SDMXylm_loop(
         ctypes.c_int(coords.shape[0]),
         ylm.ctypes.data_as(ctypes.c_void_p),
@@ -258,8 +252,6 @@ def eval_conv_gto_fast(
         ylm_atom_loc.ctypes.data_as(ctypes.c_void_p),
         ctypes.c_int(mol.natm),
     )
-    t1 = time.monotonic()
-    print("ylm time", t1 - t0)
 
     if ao_loc is None:
         ao_loc = make_loc(bas, eval_name)
@@ -281,8 +273,6 @@ def eval_conv_gto_fast(
         else:
             non0tab = make_screen_index(mol, coords, shls_slice, cutoff)
 
-    tt = 0
-    t0 = time.monotonic()
     drv(
         iter_fn,
         eval_fn,
@@ -306,13 +296,11 @@ def eval_conv_gto_fast(
         alpha_norms.ctypes.data_as(ctypes.c_void_p),
         ctypes.c_int(alphas.size),
     )
-    tt += time.monotonic() - t0
     if comp == 1:
         if "spinor" in eval_name:
             ao = ao[:, 0]
         else:
             ao = ao[0]
-    print("orb time", tt)
     return ao
 
 
@@ -452,8 +440,6 @@ def from_settings_and_mol(
         if nalpha is None:
             max_exp = np.max(mol._env[mol._bas[:, PTR_EXP]])
             nalpha = 1 + int(1 + np.log(max_exp / alpha0) / np.log(lambd))
-            # nalpha += 2 # buffer exponents
-        print("NALPHA", nalpha)
         if isinstance(settings, SDMXFullSettings):
             plan = SDMXFullPlan(settings, nspin, alpha0, lambd, nalpha)
         elif isinstance(settings, SDMXSettings):
diff --git a/ciderpress/pyscf/tests/old_test_nldf.py b/ciderpress/pyscf/tests/old_test_nldf.py
index 3e7d05c..3323f7b 100644
--- a/ciderpress/pyscf/tests/old_test_nldf.py
+++ b/ciderpress/pyscf/tests/old_test_nldf.py
@@ -322,7 +322,6 @@ def _check_nldf_occ_derivs(
         for k, orblist in orbs.items():
             for iorb in orblist:
                 dtmp = {k: [iorb]}
-                print(k, iorb)
                 labels, coeffs, en_list, sep_spins = get_labels_and_coeffs(
                     dtmp, mo_coeff, mo_occ, mo_energy
                 )
@@ -686,17 +685,12 @@ def get_e_and_v(feat):
                 rho_pert = np.ascontiguousarray(rho_pert[None, [0, 1, 2, 3, -1]])
             else:
                 rho_pert = np.ascontiguousarray(rho_pert[:, [0, 1, 2, 3, -1]])
-            # feat_pert = get_descriptors(
-            #    ana_tmp, settings,
-            #    plan_type=plan_type, aux_lambd=lambd, aug_beta=beta,
-            # )
             feat_pert = []
             for s in range(nspin):
                 feat_pert.append(nldf_pert.get_features(rho_in=rho_pert[s], spin=s))
             feat_pert = np.stack(feat_pert)
             e_pert = get_e_and_v(feat_pert)[0]
             de2 = (e_pert - e) / delta
-            print("energies", e, de, de2, de3)
             de_tot = 0
             de2_tot = 0
             for i in range(feat2.shape[1]):
@@ -705,7 +699,6 @@ def get_e_and_v(feat):
                 feat_tmp[:, i + 1 :] = 0
                 vrho1 = []
                 _e, vrho_tmp, vfeat = get_e_and_v(feat_tmp)
-                _de3 = np.sum(occd_feat * vfeat[spin])
                 for s in range(nspin):
                     vrho1.append(nldf_gen.get_potential(vfeat[s], spin=s))
                 vrho1 = np.stack(vrho1)
@@ -717,11 +710,8 @@ def get_e_and_v(feat):
                 _de = np.sum(vrho1[spin] * occd_rho)
                 de_tot += _de
                 de2_tot += _de2
-                print("energies sub", i, _e, _de, _de2, _de3)
-            print("energies tot", de_tot, de2_tot)
             assert_allclose(de, de2, rtol=rtol, atol=1e-5)
             assert_allclose(de3, de, rtol=rtol, atol=1e-5)
-            print()
 
     def test_nldf_equivalence(self):
         mols = self.mols
diff --git a/ciderpress/pyscf/tests/test_frac_lapl.py b/ciderpress/pyscf/tests/test_frac_lapl.py
index f52c860..e274da9 100644
--- a/ciderpress/pyscf/tests/test_frac_lapl.py
+++ b/ciderpress/pyscf/tests/test_frac_lapl.py
@@ -19,7 +19,6 @@
 #
 
 import ctypes
-import time
 import unittest
 
 import numpy as np
@@ -266,7 +265,6 @@ def _check_vxc(self, mol, v2=False):
             em += np.dot(edens, weight).sum()
         egrad_pred = (dm_pert * vmat).sum()
         egrad_ref = (ep - em) / DELTA
-        print(egrad_pred, egrad_ref)
         assert_allclose(egrad_pred, egrad_ref, rtol=1e-7, atol=1e-10)
 
     def test_vxc(self):
@@ -398,7 +396,6 @@ def _check_occ_derivs(self, analyzer, coords, rtol=1e-4, atol=1e-4, v2=False):
         for k, orblist in orbs.items():
             for iorb in orblist:
                 dtmp = {k: [iorb]}
-                print(k, iorb)
                 labels, coeffs, en_list, sep_spins = get_labels_and_coeffs(
                     dtmp, mo_coeff, mo_occ, mo_energy
                 )
@@ -466,9 +463,7 @@ def _check_ueg(self, rho, v2=False):
                 get_flapl(grids, grids_cen, ueg_dm, rho * np.ones(1), s=s)[0]
             )
         ueg_preds = settings.ueg_vector(rho)
-        print(rho)
         # exclude s=0.5 because it is less stable
-        print(ueg_preds[:3], ueg_vals_num[:3])
         assert_allclose(ueg_preds[:3], ueg_vals_num[:3], rtol=1e-2, atol=3e-3)
         assert_allclose(ueg_preds[3], ueg_vals_num[3], rtol=3e-2, atol=1e-2)
         assert_allclose(ueg_preds[4:], 0, rtol=0, atol=0)
@@ -497,15 +492,11 @@ def test_eval_flapl_gto(self):
             ),
         )
 
-        t0 = time.monotonic()
         ao_ref = eval_ao(mol, coords=grids.coords)
         ao_ref_cen = eval_ao(mol, coords=grids_cen.coords)
-        t1 = time.monotonic()
         ao_pred = eval_flapl_gto(settings.slist, mol, coords=grids.coords, debug=True)
-        t2 = time.monotonic()
         for i in range(ao_pred.shape[0]):
             assert_allclose(ao_ref, ao_pred[i], rtol=1e-12, atol=1e-12)
-        t3 = time.monotonic()
         kao_pred = eval_flapl_gto(settings.slist, mol, coords=grids.coords, debug=False)
         kao_pred_cen = eval_flapl_gto(
             settings.slist, mol, coords=grids_cen.coords, debug=False
@@ -524,8 +515,6 @@ def test_eval_flapl_gto(self):
                 assert_allclose(
                     k1ao_pred[1 + 4 * j + i], kfd_ref[j], rtol=1e-5, atol=1e-6
                 )
-        t4 = time.monotonic()
-        print(t1 - t0, t2 - t1, t4 - t3)
 
         assert kao_pred.shape == ao_pred.shape
         for j in range(mol.nao_nr()):
@@ -557,7 +546,6 @@ def test_eval_flapl_gto(self):
         dm = ks.make_rdm1()
         shls_slice = (0, mol.nbas)
         ao_loc = mol.ao_loc_nr()
-        print(ao_ref.shape)
         dm_gu = _dot_ao_dm(mol, ao_ref, dm, None, shls_slice, ao_loc)
         dm_ru = _dot_ao_dm(mol, ao_ref_cen, dm, None, shls_slice, ao_loc)
         dm_rg = np.einsum("ru,gu->rg", ao_ref_cen, dm_gu)
diff --git a/ciderpress/pyscf/tests/test_nldf.py b/ciderpress/pyscf/tests/test_nldf.py
index c928871..c18fe09 100644
--- a/ciderpress/pyscf/tests/test_nldf.py
+++ b/ciderpress/pyscf/tests/test_nldf.py
@@ -116,7 +116,7 @@ def setUpClass(cls):
         }
         cls.mols = [
             gto.M(atom="H 0 0 0; F 0 0 0.9", basis="def2-tzvp", spin=0),
-            gto.M(atom="O 0 0 0; O 0 0 1.2", basis="def2-tzvp", spin=2),
+            gto.M(atom="O 0 0 0; O 0 1.2 0", basis="def2-tzvp", spin=2),
             gto.M(atom="Ar", basis="def2-tzvp", spin=0),
             gto.M(atom=nh3str, spin=0, basis="def2-svp"),
         ]
@@ -176,9 +176,7 @@ def _check_nldf_equivalence(
             aux_lambd=lambd,
             aug_beta=beta,
         )
-        for i in range(ifeat_pred.shape[1]):
-            print(i)
-            assert_allclose(ifeat_pred[:, i], ifeat[:, i], rtol=rtol, atol=atol)
+        assert_allclose(ifeat_pred, ifeat, rtol=rtol, atol=atol)
         jfeat_pred = get_descriptors(
             analyzer,
             self.vj_settings,
@@ -370,7 +368,6 @@ def _check_nldf_occ_derivs(self, analyzer, coords, rtol=1e-3, atol=1e-3):
         for k, orblist in orbs.items():
             for iorb in orblist:
                 dtmp = {k: [iorb]}
-                print(k, iorb)
                 labels, coeffs, en_list, sep_spins = get_labels_and_coeffs(
                     dtmp, mo_coeff, mo_occ, mo_energy
                 )
@@ -592,7 +589,6 @@ def _check_nldf_vxc(
         nspin = 1 if analyzer.dm.ndim == 2 else 2
 
         for settings in all_settings:
-            print(settings)
             feat_ref, occd_feat, eigvals = get_descriptors(
                 analyzer,
                 settings,
@@ -714,7 +710,6 @@ def get_e_and_v(feat, use_mean=False):
                 feat_pert = np.stack(feat_pert)
                 e_pert = get_e_and_v(feat_pert, use_mean=False)[0]
                 de2 = (e_pert - e) / delta
-                print("energies", e, de, de2, de3, (de - de2) / de)
                 de_tot = 0
                 de2_tot = 0
                 for i in range(feat2.shape[1]):
@@ -734,11 +729,9 @@ def get_e_and_v(feat, use_mean=False):
                     _de = np.sum(vrho1[spin] * occd_rho)
                     de_tot += _de
                     de2_tot += _de2
-                print("energies tot", de_tot, de2_tot, (de_tot - de2_tot) / de_tot)
                 assert_allclose(de, de2, rtol=rtol, atol=atol)
                 assert_allclose(de_tot, de2_tot, rtol=2 * rtol, atol=2 * atol)
                 assert_allclose(de3, de, rtol=1e-7, atol=1e-7)
-                print()
 
     def test_nldf_equivalence(self):
         mols = self.mols
diff --git a/ciderpress/pyscf/tests/test_rks_grad.py b/ciderpress/pyscf/tests/test_rks_grad.py
index 8d5dbd6..48adf03 100644
--- a/ciderpress/pyscf/tests/test_rks_grad.py
+++ b/ciderpress/pyscf/tests/test_rks_grad.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import yaml
@@ -15,6 +35,11 @@
     "xmix": 0.25,
 }
 
+NULL_SETTINGS = {
+    "xkernel": "GGA_X_PBE",
+    "ckernel": "GGA_C_PBE",
+    "xmix": 0.00,
+}
 
 XC_SETTINGS = {
     "xkernel": None,
@@ -52,7 +77,7 @@ def build_ks_calc(mol, mlfunc, df=False, alt_settings=None):
 
 
 def setUpModule():
-    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5
+    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5, mf6, mf7
     mlfuncs = [
         "functionals/{}.yaml".format(fname)
         for fname in [
@@ -82,28 +107,39 @@ def setUpModule():
     mf4.kernel()
     mf5 = build_ks_calc(mol, mlfuncs[3], df=False).density_fit()
     mf5.kernel()
+    mf6 = build_ks_calc(mol, mlfuncs[3], df=True, alt_settings=NULL_SETTINGS)
+    mf6.kernel()
+    mf7 = dft.RKS(mol).density_fit()
+    mf7.xc = "PBE"
+    mf7.grids.level = 1
+    mf7.conv_tol = CONV_TOL
+    mf7.kernel()
     if (
         not mf1.converged
         or not mf2.converged
         or not mf3.converged
         or not mf4.converged
         or not mf5.converged
+        or not mf6.converged
+        or not mf7.converged
     ):
         raise RuntimeError(
-            "{} {} {} {} {}".format(
+            "{} {} {} {} {} {} {}".format(
                 mf1.converged,
                 mf2.converged,
                 mf3.converged,
                 mf4.converged,
                 mf5.converged,
+                mf6.converged,
+                mf7.converged,
             )
         )
 
 
 def tearDownModule():
-    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5
+    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5, mf6, mf7
     mol.stdout.close()
-    del mol, mlfuncs, mf1, mf2, mf3, mf4, mf5
+    del mol, mlfuncs, mf1, mf2, mf3, mf4, mf5, mf6, mf7
 
 
 class KnownValues(unittest.TestCase):
@@ -162,6 +198,10 @@ def test_finite_difference_df(self):
         g2 = mf5.nuc_grad_method().set(grid_response=True).kernel()
         assert_almost_equal(g2, g, 9)
 
+        g3 = mf6.nuc_grad_method().set(grid_response=True).kernel()
+        g4 = mf7.nuc_grad_method().set(grid_response=True).kernel()
+        assert_almost_equal(g3, g4, 9)
+
     def _check_fd(self, functional):
         import os
 
diff --git a/ciderpress/pyscf/tests/test_sdmx.py b/ciderpress/pyscf/tests/test_sdmx.py
index 7075e88..866e4dc 100644
--- a/ciderpress/pyscf/tests/test_sdmx.py
+++ b/ciderpress/pyscf/tests/test_sdmx.py
@@ -29,7 +29,7 @@
 from pyscf.gto.mole import ANG_OF, ATOM_OF, NCTR_OF, NPRIM_OF, PTR_COEFF, PTR_EXP
 
 from ciderpress.dft.settings import SDMXG1Settings, SDMXGSettings
-from ciderpress.dft.sph_harm_coeff import get_deriv_ylm_coeff_v2
+from ciderpress.dft.sph_harm_coeff import get_deriv_ylm_coeff
 from ciderpress.lib import load_library as load_cider_library
 from ciderpress.pyscf.sdmx import eval_conv_sh
 from ciderpress.pyscf.sdmx_slow import (
@@ -115,7 +115,7 @@ def get_test_ylm(mol, lmax_list, coords, grad=False, xyz=False):
         ctypes.c_int(mol.natm),
     )
     if grad:
-        gaunt_coeff = get_deriv_ylm_coeff_v2(np.max(lmax_list))
+        gaunt_coeff = get_deriv_ylm_coeff(np.max(lmax_list))
         libcider.SDMXylm_grad(
             ctypes.c_int(coords.shape[0]),
             ylm.ctypes.data_as(ctypes.c_void_p),
@@ -135,16 +135,12 @@ def get_test_ylm(mol, lmax_list, coords, grad=False, xyz=False):
     return ylm
 
 
-class TestYlm(unittest.TestCase):
+class TestSDMX(unittest.TestCase):
     def test_ylm(self):
         for lmax_list in [[3, 4], [3, 0], [2, 1], [4, 4], [5, 6]]:
-            print(lmax_list)
             mol = gto.M(
                 atom="H 0 0 0; F 0 0 0.9", basis="def2-tzvp", output="/dev/null"
             )
-            # lmax_list = [3, 4]
-            # mol = gto.M(atom='He 0 0 0', basis='def2-tzvp')
-            # lmax_list = [3]
 
             grids = Grids(mol)
             grids.level = 0
@@ -189,9 +185,6 @@ def test_cao(self):
         for itype in ["gauss_r2", "gauss_diff"]:
             for basis in ["def2-svp", "def2-qzvppd", "cc-pvtz"]:
                 for deriv in [0, 1]:
-                    print(basis, deriv, itype)
-                    # if itype == 'gauss_r2' and deriv == 1:
-                    #     continue
                     mol = gto.M(
                         atom="H 0 0 0; F 0 0 0.9", basis=basis, output="/dev/null"
                     )
@@ -202,17 +195,13 @@ def test_cao(self):
                     plan = inits[deriv].initialize_sdmx_generator(mol, 1).plan
                     plan.settings._integral_type = itype
 
-                    t0 = time.monotonic()
                     cao_ref = eval_conv_ao_fast(
                         plan,
                         mol,
                         coords,
                         deriv=deriv,
                     )
-                    t1 = time.monotonic()
                     cao_test = eval_conv_ao(plan, mol, coords, deriv=deriv)
-                    t2 = time.monotonic()
-                    print(t1 - t0, t2 - t1)
                     assert_allclose(cao_test, cao_ref, atol=1e-8, rtol=1e-10)
 
                     mol.stdout.close()
@@ -226,7 +215,6 @@ def test_cao_speed(self):
         for itype in ["gauss_r2", "gauss_diff"]:
             for basis in ["def2-svp", "def2-qzvppd", "cc-pvtz"]:
                 for deriv in [0, 1]:
-                    print(basis, deriv, itype)
                     mol = gto.M(
                         atom="H 0 0 0; F 0 0 0.9", basis=basis, output="/dev/null"
                     )
@@ -242,7 +230,7 @@ def test_cao_speed(self):
                     ylm_buf = np.empty((25, blksize, nao))
                     t0 = time.monotonic()
                     for i0, i1 in lib.prange(0, grids.coords.shape[0], blksize):
-                        cao_ref = eval_conv_ao_fast(
+                        eval_conv_ao_fast(
                             plan,
                             mol,
                             grids.coords[i0:i1],
@@ -254,7 +242,7 @@ def test_cao_speed(self):
                     cao_buf = np.empty((ncomp, blksize, nao))
                     t2 = time.monotonic()
                     for i0, i1 in lib.prange(0, grids.coords.shape[0], blksize):
-                        cao_ref = eval_conv_ao(
+                        eval_conv_ao(
                             plan, mol, grids.coords[i0:i1], deriv=deriv, out=cao_buf
                         )
                     t3 = time.monotonic()
@@ -262,10 +250,9 @@ def test_cao_speed(self):
                     cao_buf = np.empty((ncomp, blksize, nao))
                     t4 = time.monotonic()
                     for i0, i1 in lib.prange(0, grids.coords.shape[0], blksize):
-                        cao_ref = eval_conv_sh(
+                        eval_conv_sh(
                             plan, mol, grids.coords[i0:i1], deriv=deriv, out=cao_buf
                         )
-                        print(np.isnan(cao_ref).any())
                     t5 = time.monotonic()
                     print(t1 - t0, t3 - t2, t5 - t4)
                     print()
@@ -286,9 +273,8 @@ def test_sdmx_module(self):
         init1 = NewSDMXInit(settings1)
         new_inits = {0: init0, 1: init1}
         for itype in ["gauss_diff"]:
-            for basis in ["def2-svp", "def2-tzvpd", "def2-qzvppd"]:
+            for basis in ["def2-svp", "def2-tzvpd", "def2-qzvppd", "aug-cc-pvtz"]:
                 for deriv in [0, 1]:
-                    print(basis, deriv, itype)
                     mol = gto.M(
                         atom="H 0 0 0; F 0 0 0.9", basis=basis, output="/dev/null"
                     )
@@ -307,22 +293,23 @@ def test_sdmx_module(self):
 
                     t0 = time.monotonic()
                     ref_feat = refgen.get_features(dm, mol, grids.coords, save_buf=True)
-                    ref_feat = refgen.get_features(dm, mol, grids.coords, save_buf=True)
-                    ref_feat = refgen.get_features(dm, mol, grids.coords, save_buf=True)
                     t1 = time.monotonic()
                     test_feat = testgen.get_features(
                         dm, mol, grids.coords, save_buf=True
                     )
-                    test_feat = testgen.get_features(
-                        dm, mol, grids.coords, save_buf=True
-                    )
-                    test_feat = testgen.get_features(
-                        dm, mol, grids.coords, save_buf=True
-                    )
                     t2 = time.monotonic()
                     print(t1 - t0, t2 - t1)
 
+                    np.random.seed(42)
+                    vxc_grid = np.random.normal(size=ref_feat.shape) ** 2
+                    nao = mol.nao_nr()
+                    vxc_mat_ref = np.zeros((nao, nao))
+                    vxc_mat_test = np.zeros((nao, nao))
+                    refgen.get_vxc_(vxc_mat_ref, vxc_grid)
+                    testgen.get_vxc_(vxc_mat_test, vxc_grid)
+
                     assert_allclose(test_feat, ref_feat, rtol=1e-8, atol=1e-10)
+                    assert_allclose(vxc_mat_test, vxc_mat_ref, rtol=1e-8, atol=1e-10)
 
                     mol.stdout.close()
 
diff --git a/ciderpress/pyscf/tests/test_sdmx_slow.py b/ciderpress/pyscf/tests/test_sdmx_slow.py
index 6e1fbe6..efe8561 100644
--- a/ciderpress/pyscf/tests/test_sdmx_slow.py
+++ b/ciderpress/pyscf/tests/test_sdmx_slow.py
@@ -362,7 +362,6 @@ def _check_ueg(self, rho):
         )
         exx_pred = -0.25 * np.einsum("xb,xb->x", tmp, tmp) * rho
         exx_ref = np.array(plan.settings.ueg_vector(rho)) / rho
-        print(rho)
         assert_allclose(
             exx_pred,
             exx_ref,
@@ -417,7 +416,6 @@ def _check_occ_derivs(self, analyzer, coords, rtol=1e-4, atol=1e-4):
         for k, orblist in orbs.items():
             for iorb in orblist:
                 dtmp = {k: [iorb]}
-                print(k, iorb)
                 labels, coeffs, en_list, sep_spins = get_labels_and_coeffs(
                     dtmp, mo_coeff, mo_occ, mo_energy
                 )
diff --git a/ciderpress/pyscf/tests/test_sl.py b/ciderpress/pyscf/tests/test_sl.py
index b6b7171..6e1e100 100644
--- a/ciderpress/pyscf/tests/test_sl.py
+++ b/ciderpress/pyscf/tests/test_sl.py
@@ -222,7 +222,6 @@ def _check_sl_occ_derivs(self, analyzer, coords, rtol=1e-3, atol=1e-3):
         for k, orblist in orbs.items():
             for iorb in orblist:
                 dtmp = {k: [iorb]}
-                print(k, iorb)
                 labels, coeffs, en_list, sep_spins = get_labels_and_coeffs(
                     dtmp, mo_coeff, mo_occ, mo_energy
                 )
diff --git a/ciderpress/pyscf/tests/test_uks_grad.py b/ciderpress/pyscf/tests/test_uks_grad.py
index 5ceb8fe..497bb84 100644
--- a/ciderpress/pyscf/tests/test_uks_grad.py
+++ b/ciderpress/pyscf/tests/test_uks_grad.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import unittest
 
 import yaml
@@ -15,6 +35,11 @@
     "xmix": 0.25,
 }
 
+NULL_SETTINGS = {
+    "xkernel": "GGA_X_PBE",
+    "ckernel": "GGA_C_PBE",
+    "xmix": 0.00,
+}
 
 XC_SETTINGS = {
     "xkernel": None,
@@ -49,7 +74,7 @@ def build_ks_calc(mol, mlfunc, df=False, alt_settings=None):
 
 
 def setUpModule():
-    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5
+    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5, mf6, mf7
     mlfuncs = [
         "functionals/{}.yaml".format(fname)
         for fname in [
@@ -57,7 +82,6 @@ def setUpModule():
             "CIDER23X_SL_MGGA",
             "CIDER23X_NL_GGA",
             "CIDER23X_NL_MGGA_DTR",
-            # "VB7_MAPPED",
         ]
     ]
     mol = gto.Mole()
@@ -82,28 +106,39 @@ def setUpModule():
     mf4.kernel()
     mf5 = build_ks_calc(mol, mlfuncs[3], df=False).density_fit()
     mf5.kernel()
+    mf6 = build_ks_calc(mol, mlfuncs[3], df=True, alt_settings=NULL_SETTINGS)
+    mf6.kernel()
+    mf7 = dft.UKS(mol).density_fit()
+    mf7.xc = "PBE"
+    mf7.grids.level = 1
+    mf7.conv_tol = CONV_TOL
+    mf7.kernel()
     if (
         not mf1.converged
         or not mf2.converged
         or not mf3.converged
         or not mf4.converged
         or not mf5.converged
+        or not mf6.converged
+        or not mf7.converged
     ):
         raise RuntimeError(
-            "{} {} {} {} {}".format(
+            "{} {} {} {} {} {} {}".format(
                 mf1.converged,
                 mf2.converged,
                 mf3.converged,
                 mf4.converged,
                 mf5.converged,
+                mf6.converged,
+                mf7.converged,
             )
         )
 
 
 def tearDownModule():
-    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5
+    global mol, mlfuncs, mf1, mf2, mf3, mf4, mf5, mf6, mf7
     mol.stdout.close()
-    del mol, mlfuncs, mf1, mf2, mf3, mf4, mf5
+    del mol, mlfuncs, mf1, mf2, mf3, mf4, mf5, mf6, mf7
 
 
 class KnownValues(unittest.TestCase):
@@ -131,7 +166,6 @@ def test_finite_difference(self):
         e2 = mf_scanner(
             mol1.set_geom_("O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587")
         )
-        # TODO precision for LDA is 6
         assert_almost_equal(g[0, 2], (e1 - e2) / 2e-4 * lib.param.BOHR, 7)
 
     def test_finite_difference_nl(self):
@@ -145,7 +179,6 @@ def test_finite_difference_nl(self):
         e2 = mf_scanner(
             mol1.set_geom_("O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587")
         )
-        # TODO precision for LDA is 6
         assert_almost_equal(g[0, 2], (e1 - e2) / 2e-4 * lib.param.BOHR, 7)
 
     def test_finite_difference_df(self):
@@ -164,6 +197,10 @@ def test_finite_difference_df(self):
         g2 = mf5.nuc_grad_method().set(grid_response=True).kernel()
         assert_almost_equal(g2, g, 9)
 
+        g3 = mf6.nuc_grad_method().set(grid_response=True).kernel()
+        g4 = mf7.nuc_grad_method().set(grid_response=True).kernel()
+        assert_almost_equal(g3, g4, 9)
+
     def _check_fd(self, functional):
         import os
 
@@ -183,7 +220,6 @@ def _check_fd(self, functional):
         e2 = mf_scanner(
             mol1.set_geom_("O  0. 0. -.0001; 1  0. -0.757 0.587; 1  0. 0.757 0.587")
         )
-        # TODO precision for LDA is 6
         assert_almost_equal(g[0, 2], (e1 - e2) / 2e-4 * lib.param.BOHR, 6)
 
     def test_vk_xc(self):
diff --git a/ciderpress/pyscf/uks_grad.py b/ciderpress/pyscf/uks_grad.py
index a28b3d4..d8ccd86 100644
--- a/ciderpress/pyscf/uks_grad.py
+++ b/ciderpress/pyscf/uks_grad.py
@@ -1,3 +1,23 @@
+#!/usr/bin/env python
+# CiderPress: Machine-learning based density functional theory calculations
+# Copyright (C) 2024 The President and Fellows of Harvard College
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>
+#
+# Author: Kyle Bystrom <kylebystrom@gmail.com>
+#
+
 import numpy as np
 from pyscf import lib
 from pyscf.df.grad import uks as uks_grad_df
@@ -81,7 +101,7 @@ def get_veff(ks_grad, mol=None, dm=None):
 
         return lib.tag_array(vxc, exc1_grid=exc)
     else:
-        # density fitting is used, TODO check this always works more thoroughly
+        # density fitting is used
         if not ni.libxc.is_hybrid_xc(mf.xc):
             vj = ks_grad.get_j(mol, dm)
             vxc += vj[0] + vj[1]