From 854137408838c627e24186e55f4c895969fb7327 Mon Sep 17 00:00:00 2001
From: Shantanu Singh
Date: Fri, 26 Jan 2024 12:20:08 -0500
Subject: [PATCH] =?UTF-8?q?Fix=20Spherize=20=E2=80=93=C2=A0use=20SVD,=20si?=
 =?UTF-8?q?mplify=20calculations=20(#320)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* https://github.com/cytomining/pycytominer/issues/319
* Rewrite spherize
* revert
* black
* skip test
* comment
* docs
* Handle types
* flag zero var
* skip some conditions
* typo
* black
* typo
* checks
* Handle low rank
* fix size check
* prompt user to report bug
* change assertions to exceptions
* Use return_numpy in Spherize
* Fix typo
* handle change of column names
* add test
* test centering
* change assertions to exceptions, and explain better
* clean up error messages, remove requirement for ZCA
* Drop `create_bug_report_message`
* Update error message
* Math clarifications
* Clarify why we don't use np.linalg.inv(S)
* black missed this
---
 pycytominer/normalize.py                |  32 +++-
 pycytominer/operations/transform.py     | 220 ++++++++++++++++++------
 tests/test_normalize.py                 | 114 ++++++------
 tests/test_operations/test_transform.py |   6 +
 4 files changed, 266 insertions(+), 106 deletions(-)

diff --git a/pycytominer/normalize.py b/pycytominer/normalize.py
index d0f8c9ff..2fbe705c 100644
--- a/pycytominer/normalize.py
+++ b/pycytominer/normalize.py
@@ -5,11 +5,7 @@
 import pandas as pd
 from sklearn.preprocessing import StandardScaler, RobustScaler
 
-from pycytominer.cyto_utils import (
-    output,
-    infer_cp_features,
-    load_profiles,
-)
+from pycytominer.cyto_utils import output, infer_cp_features, load_profiles, util
 from pycytominer.operations import Spherize, RobustMAD
 
 
@@ -141,7 +137,10 @@ def normalize(
         scaler = RobustMAD(epsilon=mad_robustize_epsilon)
     elif method == "spherize":
         scaler = Spherize(
-            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
+            center=spherize_center,
+            method=spherize_method,
+            epsilon=spherize_epsilon,
+            return_numpy=True,
         )
 
     if features == "infer":
@@ -161,15 +160,30 @@ def normalize(
     # Subset to only the features measured in the sample query
     fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])
 
-    # Scale the feature dataframe
+    fitted_scaled = fitted_scaler.transform(feature_df)
+
+    columns = fitted_scaler.columns if method == "spherize" else feature_df.columns
+
     feature_df = pd.DataFrame(
-        fitted_scaler.transform(feature_df),
-        columns=feature_df.columns,
+        fitted_scaled,
+        columns=columns,
         index=feature_df.index,
     )
 
     normalized = meta_df.merge(feature_df, left_index=True, right_index=True)
 
+    if feature_df.shape != profiles.loc[:, features].shape:
+        error_detail = "The number of rows and columns in the feature dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+    if (normalized.shape[0] != profiles.shape[0]) or (
+        normalized.shape[1] != len(features) + len(meta_features)
+    ):
+        error_detail = "The number of rows and columns in the normalized dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
     if output_file != None:
         output(
             df=normalized,
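To make the new normalize code path concrete, here is a minimal end-to-end sketch (illustration only, not part of the patch; the data and column names are made up). With a ZCA-family method the output keeps the original feature names, and after sphering the feature covariance should be close to the identity:

    import numpy as np
    import pandas as pd
    from pycytominer.normalize import normalize

    rng = np.random.default_rng(0)
    mixing = np.triu(np.full((4, 4), 0.5)) + np.eye(4) * 0.5  # correlate the features
    df = pd.DataFrame(rng.normal(size=(100, 4)) @ mixing, columns=["a", "b", "c", "d"])
    df.insert(0, "id", "control")

    out = normalize(
        df,
        features=["a", "b", "c", "d"],
        meta_features=["id"],
        method="spherize",
        spherize_method="ZCA",
    )
    print(np.cov(out[["a", "b", "c", "d"]].T).round(2))  # approximately the identity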
diff --git a/pycytominer/operations/transform.py b/pycytominer/operations/transform.py
index 1912e6b1..b912a527 100644
--- a/pycytominer/operations/transform.py
+++ b/pycytominer/operations/transform.py
@@ -11,6 +11,8 @@
 from scipy.linalg import eigh
 from scipy.stats import median_abs_deviation
 from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler
+from pycytominer.cyto_utils import util
 
 
 class Spherize(BaseEstimator, TransformerMixin):
@@ -33,7 +35,7 @@ class Spherize(BaseEstimator, TransformerMixin):
         a string indicating which class of sphering to perform
     """
 
-    def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
+    def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False):
         """
         Parameters
         ----------
@@ -43,17 +45,28 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
             option to center the input X matrix
         method : str, default "ZCA"
             a string indicating which class of sphering to perform
+        return_numpy : bool, default False
+            option to return an ndarray instead of a dataframe
         """
         avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
         self.epsilon = epsilon
         self.center = center
+        self.return_numpy = return_numpy
 
-        assert (
-            method in avail_methods
-        ), f"Error {method} not supported. Select one of {avail_methods}"
+        if method not in avail_methods:
+            raise ValueError(
+                f"Method {method} not supported. Select one of {avail_methods}"
+            )
         self.method = method
 
+        # PCA-cor and ZCA-cor require center=True because we assume we are
+        # only ever interested in computing centered Pearson correlation
+        # https://stackoverflow.com/questions/23891391/uncentered-pearson-correlation
+
+        if self.method in ["PCA-cor", "ZCA-cor"] and not self.center:
+            raise ValueError("PCA-cor and ZCA-cor require center=True")
+
     def fit(self, X, y=None):
         """Identify the sphering transform given self.X
 
@@ -67,58 +80,147 @@ def fit(self, X, y=None):
         self
             With computed weights attribute
         """
-        # Get the mean of the features (columns) and center if specified
-        self.mu = X.mean()
-        if self.center:
-            X = X - self.mu
-
-        # Get the covariance matrix
-        C = (1 / X.shape[0]) * np.dot(X.transpose(), X)
-
-        if self.method in ["PCA", "ZCA"]:
-            # Get the eigenvalues and eigenvectors of the covariance matrix
-            s, U = eigh(C)
-
-            # Fix sign ambiguity of eigenvectors
-            U = pd.DataFrame(U * np.sign(np.diag(U)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))
-
-            # Calculate the sphering matrix
-            self.W = np.dot(D, U.transpose())
-
-            # If ZCA, perform additional rotation
-            if self.method == "ZCA":
-                self.W = np.dot(U, self.W)
+        # Get Numpy representation of the DataFrame
+        X = X.values
 
         if self.method in ["PCA-cor", "ZCA-cor"]:
-            # Get the correlation matrix
-            R = np.corrcoef(X.transpose())
+            # The projection matrix for PCA-cor and ZCA-cor is the same as the
+            # projection matrix for PCA and ZCA, respectively, on the standardized
+            # data. So, we first standardize the data, then compute the projection.
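As a quick numerical illustration of the comment above (not part of the patch): standardizing the data turns its covariance into the correlation matrix, so whitening the standardized data with PCA/ZCA is exactly correlation-based (PCA-cor/ZCA-cor) whitening:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(1)
    X = rng.normal(size=(50, 3)) @ np.triu(np.full((3, 3), 0.4) + np.eye(3) * 0.6)
    Xs = StandardScaler().fit_transform(X)
    n = X.shape[0]
    # np.cov uses an n - 1 denominator while StandardScaler divides by the
    # population std, hence the (n - 1) / n factor
    assert np.allclose(np.corrcoef(X.T), np.cov(Xs.T) * (n - 1) / n)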
 
-            # Get the eigenvalues and eigenvectors of the correlation matrix
-            try:
-                t, G = eigh(R)
-            except ValueError:
+            self.standard_scaler = StandardScaler().fit(X)
+            variances = self.standard_scaler.var_
+            if np.any(variances == 0):
                 raise ValueError(
                     "Divide by zero error, make sure low variance columns are removed"
                 )
 
-            # Fix sign ambiguity of eigenvectors
-            G = pd.DataFrame(G * np.sign(np.diag(G)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(t.clip(self.epsilon)))
-
-            # process the covariance diagonal matrix and fix rounding errors
-            v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                self.mean_centerer = StandardScaler(with_mean=True, with_std=False).fit(
+                    X
+                )
+                X = self.mean_centerer.transform(X)
+
+        # Get the number of observations and variables
+        n, d = X.shape
+
+        # Check the rank of the matrix X, considering the number of samples (n)
+        # and features (d)
+        r = np.linalg.matrix_rank(X)
+
+        # Case 1: More features than samples (n < d).
+        #   If centered (mean of each feature subtracted), one dimension becomes
+        #   dependent, reducing the rank to n - 1.
+        #   If not centered, the max rank is limited by n, as there can't be more
+        #   independent vectors than samples.
+        # Case 2: More samples than features, or equal (n >= d).
+        #   Here, the max rank is limited by d (the number of features), assuming
+        #   each provides unique information.
+
+        if not (
+            (r == d)
+            | (self.center & (r == n - 1))
+            | ((not self.center) & (r == n))
+        ):
+            raise ValueError(
+                f"The data matrix X is not full rank: n = {n}, d = {d}, r = {r}. "
+                "Perfect linear dependencies are unusual in data matrices, so "
+                "something seems amiss. Check for linear dependencies in the data "
+                "and remove them."
+            )
+
+        # Get the eigenvalues and eigenvectors of the covariance matrix
+        # by computing the SVD on the data matrix
+        # https://stats.stackexchange.com/q/134282/8494
+        _, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
+
+        # If n <= d, Sigma has shape (n,), so it will need to be expanded to
+        # length d, padded with the r'th element of Sigma
+        if n <= d:
+            # Do some error checking
+            if Sigma.shape[0] != n:
+                error_detail = f"When n <= d, Sigma should have shape (n,) i.e. ({n},) but it is {Sigma.shape}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
 
-            # Calculate the sphering matrix
-            self.W = np.dot(np.dot(D, G.transpose()), v)
+            if (r != n - 1) & (r != n):
+                error_detail = f"When n <= d, the rank should be n - 1 or n i.e. {n - 1} or {n} but it is {r}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+            Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))
+
+        Sigma = Sigma + self.epsilon
+
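A small sanity check of the eigendecomposition/SVD correspondence that the derivation below relies on (illustration only, not part of the patch): the eigenvalues of cov(X) are the squared singular values of the centered data divided by n - 1.

    import numpy as np
    from scipy.linalg import eigh

    rng = np.random.default_rng(2)
    X = rng.normal(size=(30, 4))
    X = X - X.mean(axis=0)  # mean-center each feature
    evals, _ = eigh(np.cov(X.T))  # eigenvalues of the covariance matrix
    _, S, _ = np.linalg.svd(X)    # singular values of the centered data
    assert np.allclose(np.sort(evals), np.sort(S**2 / (len(X) - 1)))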
+        # From https://arxiv.org/abs/1512.00809, the PCA whitening matrix is
+        #
+        #     W = Lambda^(-1/2) E^T
+        #
+        # where
+        # - E is the eigenvector matrix
+        # - Lambda is the (diagonal) eigenvalue matrix
+        # of cov(X), where X is the data matrix
+        # (i.e., cov(X) = E Lambda E^T)
+        #
+        # However, W can also be computed using the Singular Value Decomposition
+        # (SVD) of X. Assuming X is mean-centered (zero mean for each feature),
+        # the SVD of X is given by:
+        #
+        #     X = U S V^T
+        #
+        # The covariance matrix of X can be expressed using its SVD:
+        #
+        #     cov(X) = (X^T X) / (n - 1)
+        #            = (V S^2 V^T) / (n - 1)
+        #
+        # By comparing cov(X) = E Lambda E^T with the SVD form, we identify:
+        #
+        #     E = V                   (eigenvectors)
+        #     Lambda = S^2 / (n - 1)  (eigenvalues)
+        #
+        # Thus, Lambda^(-1/2) can be expressed as:
+        #
+        #     Lambda^(-1/2) = inv(S) * sqrt(n - 1)
+        #
+        # Therefore, the PCA whitening matrix W becomes:
+        #
+        #     W = Lambda^(-1/2) * E^T
+        #       = (inv(S) * sqrt(n - 1)) * V^T
+        #       = (inv(S) * V^T) * sqrt(n - 1)
+        #
+        # In NumPy, this is implemented as:
+        #
+        #     W = (np.linalg.inv(S) @ Vt) * np.sqrt(n - 1)
+        #
+        # But materializing S as a full d x d diagonal matrix and inverting it
+        # with `np.linalg.inv` is memory-intensive.
+        #
+        # A more memory-efficient alternative is:
+        #
+        #     W = (Vt / Sigma[:, np.newaxis]) * np.sqrt(n - 1)
+        #
+        # where Sigma contains the diagonal elements of S (singular values).
+
+        # Store W transposed so that `transform` can compute X @ self.W
+        self.W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)
+
+        # If ZCA, perform additional rotation
+        if self.method in ["ZCA", "ZCA-cor"]:
+            # Note: there was previously a requirement that r == d, since
+            # otherwise the ZCA transform would not be well defined. It later
+            # turned out that this requirement was not necessary.
+
+            self.W = self.W @ Vt
+
+        # The number of columns of self.W should be equal to that of X
 
-            # If ZCA-cor, perform additional rotation
-            if self.method == "ZCA-cor":
-                self.W = np.dot(G, self.W)
+        if self.W.shape[1] != X.shape[1]:
+            error_detail = (
+                f"The number of columns of W should be equal to that of X. "
+                f"However, W has {self.W.shape[1]} columns, X has {X.shape[1]} columns"
+            )
+            context = "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+            raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
 
         return self
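A quick verification of the two claims in the comment above (illustration only, not part of the patch): the broadcasting form equals the explicit matrix-inverse form, and the resulting W actually whitens the data:

    import numpy as np

    rng = np.random.default_rng(3)
    X = rng.normal(size=(200, 5))
    X = X - X.mean(axis=0)  # mean-center each feature
    n = X.shape[0]
    _, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
    # Row-wise division by Sigma is equivalent to inv(diag(Sigma)) @ Vt
    assert np.allclose(Vt / Sigma[:, np.newaxis], np.diag(1.0 / Sigma) @ Vt)
    W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)
    # Whitened data has (approximately) identity covariance
    assert np.allclose(np.cov((X @ W).T), np.eye(5), atol=1e-8)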
@@ -137,7 +239,29 @@ def transform(self, X, y=None):
         pandas.core.frame.DataFrame
             Spherized dataframe
         """
-        return np.dot(X - self.mu, self.W.transpose())
+
+        columns = X.columns
+
+        # Get Numpy representation of the DataFrame
+        X = X.values
+
+        if self.method in ["PCA-cor", "ZCA-cor"]:
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                X = self.mean_centerer.transform(X)
+
+        if self.method in ["PCA", "PCA-cor"]:
+            columns = ["PC" + str(i) for i in range(1, X.shape[1] + 1)]
+
+        self.columns = columns
+
+        XW = X @ self.W
+
+        if self.return_numpy:
+            return XW
+        else:
+            return pd.DataFrame(XW, columns=columns)
 
 
 class RobustMAD(BaseEstimator, TransformerMixin):
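For reference, a short usage sketch of the reworked transform (illustration only; the dataframe is made up). PCA-flavored methods rename the output columns to PC1..PCd, ZCA-flavored methods keep the input feature names, and return_numpy=True skips the DataFrame wrapper:

    import numpy as np
    import pandas as pd
    from pycytominer.operations import Spherize

    df = pd.DataFrame(
        np.random.default_rng(4).normal(size=(40, 3)), columns=["x", "y", "z"]
    )

    print(list(Spherize(method="PCA").fit(df).transform(df).columns))  # ['PC1', 'PC2', 'PC3']
    print(list(Spherize(method="ZCA").fit(df).transform(df).columns))  # ['x', 'y', 'z']
    print(type(Spherize(method="ZCA", return_numpy=True).fit(df).transform(df)))  # numpy.ndarray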
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
index e787ead0..ffbb07d6 100644
--- a/tests/test_normalize.py
+++ b/tests/test_normalize.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 from pycytominer.normalize import normalize
+import pytest
 
 random.seed(123)
 
@@ -445,59 +446,74 @@ def test_normalize_standardize_allsamples_compress():
 def test_normalize_spherize():
     for spherize_method in ["ZCA", "PCA", "ZCA-cor", "PCA-cor"]:
         for spherize_center in [True, False]:
-            result = normalize(
-                data_spherize_df,
-                features=["a", "b", "c", "d"],
-                meta_features=["id"],
-                method="spherize",
-                spherize_method=spherize_method,
-                spherize_center=spherize_center,
-            )
-            result_cov = (
-                pd.DataFrame(np.cov(np.transpose(result.drop("id", axis="columns"))))
-                .round()
-                .sum()
-                .clip(1)
-                .sum()
-            )
-            expected_result = data_spherize_df.shape[1] - 1
-            assert result_cov == expected_result
-
-            result = normalize(
-                data_spherize_df,
-                samples="id == 'control'",
-                features=["a", "b", "c", "d"],
-                meta_features=["id"],
-                method="spherize",
-                spherize_method=spherize_method,
-                spherize_center=spherize_center,
-            )
-            result_cov = (
-                np.cov(
-                    np.transpose(
-                        result.query("id == 'control'").drop("id", axis="columns")
-                    )
+            if spherize_method in ["PCA-cor", "ZCA-cor"] and not spherize_center:
+                # verify that calling normalize will throw a ValueError
+                with pytest.raises(ValueError):
+                    normalize(
+                        data_spherize_df,
+                        features=["a", "b", "c", "d"],
+                        meta_features=["id"],
+                        method="spherize",
+                        spherize_method=spherize_method,
+                        spherize_center=spherize_center,
+                    )
+
+            else:
+                result = normalize(
+                    data_spherize_df,
+                    features=["a", "b", "c", "d"],
+                    meta_features=["id"],
+                    method="spherize",
+                    spherize_method=spherize_method,
+                    spherize_center=spherize_center,
+                )
+                result_cov = (
+                    pd.DataFrame(
+                        np.cov(np.transpose(result.drop("id", axis="columns")))
+                    )
+                    .round()
+                    .sum()
+                    .clip(1)
+                    .sum()
+                )
+                expected_result = data_spherize_df.shape[1] - 1
+                assert result_cov == expected_result
+
+                result = normalize(
+                    data_spherize_df,
+                    samples="id == 'control'",
+                    features=["a", "b", "c", "d"],
+                    meta_features=["id"],
+                    method="spherize",
+                    spherize_method=spherize_method,
+                    spherize_center=spherize_center,
+                )
+                result_cov = (
+                    np.cov(
+                        np.transpose(
+                            result.query("id == 'control'").drop("id", axis="columns")
+                        )
+                    )
+                    .round()
+                    .sum()
+                    .clip(1)
+                    .sum()
                 )
-                .round()
-                .sum()
-                .clip(1)
-                .sum()
-            )
-            # Add some tolerance to result b/c of low sample size
-            expected_result = data_spherize_df.shape[1]
-            assert result_cov < expected_result
-
-            non_spherize_result_cov = (
-                np.cov(
-                    np.transpose(
-                        result.query("id == 'treatment'").drop("id", axis="columns")
+                # Add some tolerance to result b/c of low sample size
+                expected_result = data_spherize_df.shape[1]
+                assert result_cov < expected_result
+
+                non_spherize_result_cov = (
+                    np.cov(
+                        np.transpose(
+                            result.query("id == 'treatment'").drop("id", axis="columns")
+                        )
                     )
-                )
-                .round()
-                .sum()
-                .sum()
-            )
-            assert non_spherize_result_cov >= expected_result - 5
+                    .round()
+                    .sum()
+                    .sum()
+                )
+                assert non_spherize_result_cov >= expected_result - 5
 
 
 def test_spherize_epsilon():
diff --git a/tests/test_operations/test_transform.py b/tests/test_operations/test_transform.py
index 25b914ff..4ee08ddc 100644
--- a/tests/test_operations/test_transform.py
+++ b/tests/test_operations/test_transform.py
@@ -22,6 +22,8 @@ def test_spherize():
     spherize_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
     for method in spherize_methods:
         for center in [True, False]:
+            if method in ["PCA-cor", "ZCA-cor"] and not center:
+                continue
             scaler = Spherize(method=method, center=center)
             scaler = scaler.fit(data_df)
             transform_df = scaler.transform(data_df)
@@ -46,6 +48,8 @@ def test_low_variance_spherize():
     spherize_methods = ["PCA-cor", "ZCA-cor"]
     for method in spherize_methods:
         for center in [True, False]:
+            if method in ["PCA-cor", "ZCA-cor"] and not center:
+                continue
             scaler = Spherize(method=method, center=center)
             with pytest.raises(ValueError) as errorinfo:
                 scaler = scaler.fit(data_no_variance)
@@ -57,6 +61,8 @@ def test_spherize_precenter():
     data_precentered = data_df - data_df.mean()
     spherize_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
     for method in spherize_methods:
+        if method in ["PCA-cor", "ZCA-cor"]:
+            continue
         scaler = Spherize(method=method, center=False)
         scaler = scaler.fit(data_precentered)
         transform_df = scaler.transform(data_df)
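Finally, a one-line illustration (not part of the patch) of the new guard that the tests above skip around: requesting a correlation-based method without centering now fails at construction time.

    import pytest
    from pycytominer.operations import Spherize

    with pytest.raises(ValueError, match="PCA-cor and ZCA-cor require center=True"):
        Spherize(method="PCA-cor", center=False)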