Fix Spherize – use SVD, simplify calculations (#320)
* #319

* Rewrite spherize

* revert

* black

* skip test

* comment

* docs

* Handle types

* flag zero var

* skip some conditions

* typo

* black

* typo

* checks

* Handle low rank

* fix size check

* prompt user to report bug

* change assertions to exceptions

* Use return_numpy in Spherize

* Fix typo

* handle change of column names

* add test

* test centering

* change assertions to exceptions, and explain better

* clean up error messages, remove requirement for ZCA

* Drop `create_bug_report_message`

* Update error message

* Math clarifications

* Clarify why we don't use np.linalg.inv(S)

* black missed this
shntnu authored Jan 26, 2024
1 parent cf313bf commit 8541374
Showing 4 changed files with 266 additions and 106 deletions.
32 changes: 23 additions & 9 deletions pycytominer/normalize.py
@@ -5,11 +5,7 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler

-from pycytominer.cyto_utils import (
-    output,
-    infer_cp_features,
-    load_profiles,
-)
+from pycytominer.cyto_utils import output, infer_cp_features, load_profiles, util
from pycytominer.operations import Spherize, RobustMAD


@@ -141,7 +137,10 @@ def normalize(
        scaler = RobustMAD(epsilon=mad_robustize_epsilon)
    elif method == "spherize":
        scaler = Spherize(
-            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
+            center=spherize_center,
+            method=spherize_method,
+            epsilon=spherize_epsilon,
+            return_numpy=True,
        )

    if features == "infer":
@@ -161,15 +160,30 @@
    # Subset to only the features measured in the sample query
    fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

+    # Scale the feature dataframe
+    fitted_scaled = fitted_scaler.transform(feature_df)
+
+    columns = fitted_scaler.columns if method == "spherize" else feature_df.columns
+
    feature_df = pd.DataFrame(
-        fitted_scaler.transform(feature_df),
-        columns=feature_df.columns,
+        fitted_scaled,
+        columns=columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

+    if feature_df.shape != profiles.loc[:, features].shape:
+        error_detail = "The number of rows and columns in the feature dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+    if (normalized.shape[0] != profiles.shape[0]) or (
+        normalized.shape[1] != len(features) + len(meta_features)
+    ):
+        error_detail = "The number of rows and columns in the normalized dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

    if output_file != None:
        output(
            df=normalized,
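
For context, here is a minimal sketch of how the updated spherize path is exercised end to end through `pycytominer.normalize`. The DataFrame, column names, and values are illustrative only, and the keyword arguments assume the existing `normalize` signature used in this file:

import pandas as pd
from pycytominer import normalize

# Toy profiles: two metadata columns and three feature columns
profiles = pd.DataFrame({
    "Metadata_plate": ["p1"] * 6,
    "Metadata_well": ["A1", "A2", "A3", "B1", "B2", "B3"],
    "x": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    "y": [2.0, 1.0, 4.0, 3.0, 6.0, 5.0],
    "z": [0.5, 1.5, 1.0, 2.5, 2.0, 3.0],
})

# With method="spherize", the scaler is now constructed with
# return_numpy=True, and the output feature columns come from
# fitted_scaler.columns (PC1..PCd for the PCA variants).
normalized = normalize(
    profiles=profiles,
    features=["x", "y", "z"],
    meta_features=["Metadata_plate", "Metadata_well"],
    method="spherize",
    spherize_method="ZCA",
)
print(normalized.columns.tolist())
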
220 changes: 172 additions & 48 deletions pycytominer/operations/transform.py
@@ -11,6 +11,8 @@
from scipy.linalg import eigh
from scipy.stats import median_abs_deviation
from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler
+from pycytominer.cyto_utils import util


class Spherize(BaseEstimator, TransformerMixin):
Expand All @@ -33,7 +35,7 @@ class Spherize(BaseEstimator, TransformerMixin):
        a string indicating which class of sphering to perform
    """

-    def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
+    def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False):
        """
        Parameters
        ----------
----------
@@ -43,17 +45,28 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
            option to center the input X matrix
        method : str, default "ZCA"
            a string indicating which class of sphering to perform
+        return_numpy : bool, default False
+            option to return a numpy ndarray instead of a DataFrame
        """
        avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]

        self.epsilon = epsilon
        self.center = center
+        self.return_numpy = return_numpy

-        assert (
-            method in avail_methods
-        ), f"Error {method} not supported. Select one of {avail_methods}"
+        if method not in avail_methods:
+            raise ValueError(
+                f"Error {method} not supported. Select one of {avail_methods}"
+            )
        self.method = method

+        # PCA-cor and ZCA-cor require center=True because we assume we are
+        # only ever interested in computing the centered Pearson correlation
+        # https://stackoverflow.com/questions/23891391/uncentered-pearson-correlation
+        if self.method in ["PCA-cor", "ZCA-cor"] and not self.center:
+            raise ValueError("PCA-cor and ZCA-cor require center=True")

    def fit(self, X, y=None):
        """Identify the sphering transform given self.X

@@ -67,58 +80,147 @@
            self
            With computed weights attribute
        """
-        # Get the mean of the features (columns) and center if specified
-        self.mu = X.mean()
-        if self.center:
-            X = X - self.mu
-
-        # Get the covariance matrix
-        C = (1 / X.shape[0]) * np.dot(X.transpose(), X)
-
-        if self.method in ["PCA", "ZCA"]:
-            # Get the eigenvalues and eigenvectors of the covariance matrix
-            s, U = eigh(C)
-
-            # Fix sign ambiguity of eigenvectors
-            U = pd.DataFrame(U * np.sign(np.diag(U)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))
-
-            # Calculate the sphering matrix
-            self.W = np.dot(D, U.transpose())
-
-            # If ZCA, perform additional rotation
-            if self.method == "ZCA":
-                self.W = np.dot(U, self.W)
+        # Get the Numpy representation of the DataFrame
+        X = X.values

        if self.method in ["PCA-cor", "ZCA-cor"]:
-            # Get the correlation matrix
-            R = np.corrcoef(X.transpose())
-
-            # Get the eigenvalues and eigenvectors of the correlation matrix
-            try:
-                t, G = eigh(R)
-            except ValueError:
-                raise ValueError(
-                    "Divide by zero error, make sure low variance columns are removed"
-                )
-
-            # Fix sign ambiguity of eigenvectors
-            G = pd.DataFrame(G * np.sign(np.diag(G)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(t.clip(self.epsilon)))
-
-            # process the covariance diagonal matrix and fix rounding errors
-            v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))
+            # The projection matrix for PCA-cor and ZCA-cor is the same as the
+            # projection matrix for PCA and ZCA, respectively, on the
+            # standardized data. So, we first standardize the data, then
+            # compute the projection matrix as usual.
+            self.standard_scaler = StandardScaler().fit(X)
+            variances = self.standard_scaler.var_
+            if np.any(variances == 0):
+                raise ValueError(
+                    "Divide by zero error, make sure low variance columns are removed"
+                )
+
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                self.mean_centerer = StandardScaler(with_mean=True, with_std=False).fit(
+                    X
+                )
+                X = self.mean_centerer.transform(X)

+        # Get the number of observations and variables
+        n, d = X.shape
+
+        # Check the rank of matrix X, considering the number of samples (n)
+        # and features (d)
+        r = np.linalg.matrix_rank(X)
+
+        # Case 1: More features than samples (n < d).
+        # If centered (mean of each feature subtracted), one dimension becomes
+        # dependent, reducing the rank to n - 1. If not centered, the maximum
+        # rank is n, as there cannot be more independent vectors than samples.
+        # Case 2: At least as many samples as features (n >= d).
+        # Here, the maximum rank is d (the number of features), assuming each
+        # feature provides unique information.
+        if not (
+            (r == d)
+            or (self.center and r == n - 1)
+            or (not self.center and r == n)
+        ):
+            raise ValueError(
+                f"The data matrix X is not full rank: n = {n}, d = {d}, r = {r}. "
+                "Perfect linear dependencies are unusual in data matrices, so "
+                "something seems amiss. Check for linear dependencies in the "
+                "data and remove them."
+            )

+        # Get the eigenvalues and eigenvectors of the covariance matrix
+        # by computing the SVD of the data matrix
+        # https://stats.stackexchange.com/q/134282/8494
+        _, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
+
+        # If n <= d, then Sigma has shape (n,), so it needs to be expanded to
+        # length d, filled with the value of the r'th element of Sigma
+        if n <= d:
+            # Do some error checking
+            if Sigma.shape[0] != n:
+                error_detail = f"When n <= d, Sigma should have shape (n,), i.e. ({n},), but it is {Sigma.shape}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+            if (r != n - 1) and (r != n):
+                error_detail = f"When n <= d, the rank should be n - 1 or n, i.e. {n - 1} or {n}, but it is {r}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+            Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))
+
+        Sigma = Sigma + self.epsilon

+        # From https://arxiv.org/abs/1512.00809, the PCA whitening matrix is
+        #
+        #   W = Lambda^(-1/2) E^T
+        #
+        # where
+        # - E is the eigenvector matrix
+        # - Lambda is the (diagonal) eigenvalue matrix
+        # of cov(X), where X is the data matrix
+        # (i.e., cov(X) = E Lambda E^T)
+        #
+        # However, W can also be computed using the Singular Value Decomposition
+        # (SVD) of X. Assuming X is mean-centered (zero mean for each feature),
+        # the SVD of X is given by:
+        #
+        #   X = U S V^T
+        #
+        # The covariance matrix of X can be expressed using its SVD:
+        #
+        #   cov(X) = (X^T X) / (n - 1)
+        #          = (V S^2 V^T) / (n - 1)
+        #
+        # By comparing cov(X) = E Lambda E^T with the SVD form, we identify:
+        #
+        #   E = V                  (eigenvectors)
+        #   Lambda = S^2 / (n - 1) (eigenvalues)
+        #
+        # Thus, Lambda^(-1/2) can be expressed as:
+        #
+        #   Lambda^(-1/2) = inv(S) * sqrt(n - 1)
+        #
+        # Therefore, the PCA whitening matrix W becomes:
+        #
+        #   W = Lambda^(-1/2) * E^T
+        #     = (inv(S) * sqrt(n - 1)) * V^T
+        #     = (inv(S) * V^T) * sqrt(n - 1)
+        #
+        # In NumPy, this could be implemented as:
+        #
+        #   W = (np.linalg.inv(S) @ Vt) * np.sqrt(n - 1)
+        #
+        # But computing `np.linalg.inv(S)` is memory-intensive.
+        #
+        # A more memory-efficient alternative is:
+        #
+        #   W = (Vt / Sigma[:, np.newaxis]) * np.sqrt(n - 1)
+        #
+        # where Sigma contains the diagonal elements of S (the singular values).
+
+        # Note: we store the transpose, i.e. V @ inv(S) * sqrt(n - 1), so that
+        # `transform` can compute the whitened data as X @ self.W
+        self.W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)

+        # If ZCA or ZCA-cor, perform an additional rotation
+        if self.method in ["ZCA", "ZCA-cor"]:
+            # Note: there was previously a requirement that r == d, without
+            # which the ZCA transform would not be well defined. However, it
+            # later appeared that this requirement was not necessary.
+            self.W = self.W @ Vt
+
+        # The number of columns of self.W should equal that of X
-        assert (
-            self.W.shape[1] == X.shape[1]
-        ), f"Error: W has {self.W.shape[1]} columns, X has {X.shape[1]} columns"
-
-        # If ZCA-cor, perform additional rotation
-        if self.method == "ZCA-cor":
-            self.W = np.dot(G, self.W)
+        if self.W.shape[1] != X.shape[1]:
+            error_detail = (
+                "The number of columns of W should be equal to that of X. "
+                f"However, W has {self.W.shape[1]} columns and X has {X.shape[1]} columns"
+            )
+            context = "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+            raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

        return self
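
A side note on the PCA-cor/ZCA-cor branch in fit() above: the correlation matrix of X is exactly the covariance matrix of the standardized data, which is why standardizing first and then proceeding as for PCA/ZCA is equivalent to working with correlations. A minimal sketch (toy values, not from the repository):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(4)
X = rng.normal(size=(50, 3)) * np.array([1.0, 10.0, 0.1])  # mixed feature scales

# corrcoef(X) equals the (ddof=0) covariance of the standardized data
Z = StandardScaler().fit_transform(X)
R = np.corrcoef(X, rowvar=False)
C = (Z.T @ Z) / len(Z)
print(np.allclose(R, C))  # True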
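
The rank argument used by fit() can also be checked numerically. A small sketch, assuming a random matrix with more features than samples:

import numpy as np

rng = np.random.default_rng(0)
n, d = 5, 10  # more features than samples
X = rng.normal(size=(n, d))
Xc = X - X.mean(axis=0)  # center each feature (column)

# A generic X has rank min(n, d) = n; centering introduces one linear
# dependency across rows, dropping the rank to n - 1.
print(np.linalg.matrix_rank(X))   # 5
print(np.linalg.matrix_rank(Xc))  # 4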
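
The Sigma-padding branch (n <= d) can likewise be traced with a toy example; the shapes below follow from `np.linalg.svd` with full_matrices=True:

import numpy as np

rng = np.random.default_rng(1)
n, d, eps = 4, 6, 1e-6
X = rng.normal(size=(n, d))
X = X - X.mean(axis=0)  # centered, so the rank r is n - 1 = 3

_, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
print(Sigma.shape, Vt.shape)  # (4,) (6, 6)

r = np.linalg.matrix_rank(X)
# Pad Sigma out to length d by repeating the r'th singular value, then
# regularize, mirroring the n <= d branch of fit() above.
Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))
Sigma = Sigma + eps
print(Sigma.shape)  # (6,)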
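
Finally, the SVD-based whitening matrix can be sanity-checked: after projecting with W, the sample covariance of the data should be approximately the identity. A minimal sketch using the same formula as fit(), with the epsilon regularization omitted since this X is full rank:

import numpy as np

rng = np.random.default_rng(2)
n, d = 200, 3
X = rng.normal(size=(n, d)) @ rng.normal(size=(d, d))  # correlated features
X = X - X.mean(axis=0)

_, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)

# After whitening, the sample covariance of X @ W is ~identity
Z = X @ W
print(np.allclose(np.cov(Z, rowvar=False), np.eye(d)))  # True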

@@ -137,7 +239,29 @@ def transform(self, X, y=None):
        pandas.core.frame.DataFrame
            Spherized dataframe (or numpy.ndarray if return_numpy=True)
        """
-        return np.dot(X - self.mu, self.W.transpose())
+        columns = X.columns
+
+        # Get the Numpy representation of the DataFrame
+        X = X.values
+
+        if self.method in ["PCA-cor", "ZCA-cor"]:
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                X = self.mean_centerer.transform(X)
+
+        if self.method in ["PCA", "PCA-cor"]:
+            columns = ["PC" + str(i) for i in range(1, X.shape[1] + 1)]
+
+        self.columns = columns
+
+        XW = X @ self.W
+
+        if self.return_numpy:
+            return XW
+        else:
+            return pd.DataFrame(XW, columns=columns)


class RobustMAD(BaseEstimator, TransformerMixin):
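
And a minimal sketch of using `Spherize` directly, showing the new `return_numpy` option and the PC column renaming performed by transform() (toy data, illustrative only):

import numpy as np
import pandas as pd
from pycytominer.operations import Spherize

df = pd.DataFrame(
    np.random.default_rng(3).normal(size=(20, 4)),
    columns=["f1", "f2", "f3", "f4"],
)

# PCA sphering renames the output columns to PC1..PC4
spherized = Spherize(method="PCA").fit(df).transform(df)
print(list(spherized.columns))  # ['PC1', 'PC2', 'PC3', 'PC4']

# return_numpy=True yields an ndarray instead of a DataFrame
arr = Spherize(method="ZCA", return_numpy=True).fit(df).transform(df)
print(type(arr))  # <class 'numpy.ndarray'>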
