From 854137408838c627e24186e55f4c895969fb7327 Mon Sep 17 00:00:00 2001
From: Shantanu Singh
Date: Fri, 26 Jan 2024 12:20:08 -0500
Subject: [PATCH] =?UTF-8?q?Fix=20Spherize=20=E2=80=93=C2=A0use=20SVD,=20si?=
 =?UTF-8?q?mplify=20calculations=20(#320)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* https://github.com/cytomining/pycytominer/issues/319
* Rewrite spherize
* revert
* black
* skip test
* comment
* docs
* Handle types
* flag zero var
* skip some conditions
* typo
* black
* typo
* checks
* Handle low rank
* fix size check
* prompt user to report bug
* change assertions to exceptions
* Use return_numpy in Spherize
* Fix typo
* handle change of column names
* add test
* test centering
* change assertions to exceptions, and explain better
* clean up error messages, remove requirement for ZCA
* Drop `create_bug_report_message`
* Update error message
* Math clarifications
* Clarify why we don't use np.linalg.inv(S)
* black missed this
---
 pycytominer/normalize.py                |  32 +++-
 pycytominer/operations/transform.py     | 220 ++++++++++++++++++------
 tests/test_normalize.py                 | 114 ++++++------
 tests/test_operations/test_transform.py |   6 +
 4 files changed, 266 insertions(+), 106 deletions(-)

diff --git a/pycytominer/normalize.py b/pycytominer/normalize.py
index d0f8c9ff..2fbe705c 100644
--- a/pycytominer/normalize.py
+++ b/pycytominer/normalize.py
@@ -5,11 +5,7 @@
 import pandas as pd
 from sklearn.preprocessing import StandardScaler, RobustScaler
 
-from pycytominer.cyto_utils import (
-    output,
-    infer_cp_features,
-    load_profiles,
-)
+from pycytominer.cyto_utils import output, infer_cp_features, load_profiles, util
 from pycytominer.operations import Spherize, RobustMAD
 
 
@@ -141,7 +137,10 @@ def normalize(
         scaler = RobustMAD(epsilon=mad_robustize_epsilon)
     elif method == "spherize":
         scaler = Spherize(
-            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
+            center=spherize_center,
+            method=spherize_method,
+            epsilon=spherize_epsilon,
+            return_numpy=True,
         )
 
     if features == "infer":
@@ -161,15 +160,30 @@ def normalize(
     # Subset to only the features measured in the sample query
     fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])
 
-    # Scale the feature dataframe
+    fitted_scaled = fitted_scaler.transform(feature_df)
+
+    columns = fitted_scaler.columns if method == "spherize" else feature_df.columns
+
     feature_df = pd.DataFrame(
-        fitted_scaler.transform(feature_df),
-        columns=feature_df.columns,
+        fitted_scaled,
+        columns=columns,
         index=feature_df.index,
     )
 
     normalized = meta_df.merge(feature_df, left_index=True, right_index=True)
 
+    if feature_df.shape != profiles.loc[:, features].shape:
+        error_detail = "The number of rows and columns in the feature dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+    if (normalized.shape[0] != profiles.shape[0]) or (
+        normalized.shape[1] != len(features) + len(meta_features)
+    ):
+        error_detail = "The number of rows and columns in the normalized dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
     if output_file != None:
         output(
             df=normalized,
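To make the new normalize code path concrete, here is a minimal end-to-end sketch (illustration only, not part of the patch; the data and column names are made up). With a ZCA-family method the output keeps the original feature names, and after sphering the feature covariance should be close to the identity:

    import numpy as np
    import pandas as pd
    from pycytominer.normalize import normalize

    rng = np.random.default_rng(0)
    mixing = np.triu(np.full((4, 4), 0.5)) + np.eye(4) * 0.5  # correlate the features
    df = pd.DataFrame(rng.normal(size=(100, 4)) @ mixing, columns=["a", "b", "c", "d"])
    df.insert(0, "id", "control")

    out = normalize(
        df,
        features=["a", "b", "c", "d"],
        meta_features=["id"],
        method="spherize",
        spherize_method="ZCA",
    )
    print(np.cov(out[["a", "b", "c", "d"]].T).round(2))  # approximately the identity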
diff --git a/pycytominer/operations/transform.py b/pycytominer/operations/transform.py
index 1912e6b1..b912a527 100644
--- a/pycytominer/operations/transform.py
+++ b/pycytominer/operations/transform.py
@@ -11,6 +11,8 @@
 from scipy.linalg import eigh
 from scipy.stats import median_abs_deviation
 from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler
+from pycytominer.cyto_utils import util
 
 
 class Spherize(BaseEstimator, TransformerMixin):
@@ -33,7 +35,7 @@ class Spherize(BaseEstimator, TransformerMixin):
         a string indicating which class of sphering to perform
     """
 
-    def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
+    def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False):
         """
         Parameters
         ----------
@@ -43,17 +45,28 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
             option to center the input X matrix
         method : str, default "ZCA"
             a string indicating which class of sphering to perform
+        return_numpy : bool, default False
+            option to return an ndarray instead of a dataframe
         """
         avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
         self.epsilon = epsilon
         self.center = center
+        self.return_numpy = return_numpy
 
-        assert (
-            method in avail_methods
-        ), f"Error {method} not supported. Select one of {avail_methods}"
+        if method not in avail_methods:
+            raise ValueError(
+                f"Method {method} not supported. Select one of {avail_methods}"
+            )
         self.method = method
 
+        # PCA-cor and ZCA-cor require center=True because we assume we are
+        # only ever interested in computing centered Pearson correlation
+        # https://stackoverflow.com/questions/23891391/uncentered-pearson-correlation
+
+        if self.method in ["PCA-cor", "ZCA-cor"] and not self.center:
+            raise ValueError("PCA-cor and ZCA-cor require center=True")
+
     def fit(self, X, y=None):
         """Identify the sphering transform given self.X
 
@@ -67,58 +80,147 @@ def fit(self, X, y=None):
         self
             With computed weights attribute
         """
-        # Get the mean of the features (columns) and center if specified
-        self.mu = X.mean()
-        if self.center:
-            X = X - self.mu
-
-        # Get the covariance matrix
-        C = (1 / X.shape[0]) * np.dot(X.transpose(), X)
-
-        if self.method in ["PCA", "ZCA"]:
-            # Get the eigenvalues and eigenvectors of the covariance matrix
-            s, U = eigh(C)
-
-            # Fix sign ambiguity of eigenvectors
-            U = pd.DataFrame(U * np.sign(np.diag(U)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))
-
-            # Calculate the sphering matrix
-            self.W = np.dot(D, U.transpose())
-
-            # If ZCA, perform additional rotation
-            if self.method == "ZCA":
-                self.W = np.dot(U, self.W)
+        # Get Numpy representation of the DataFrame
+        X = X.values
 
         if self.method in ["PCA-cor", "ZCA-cor"]:
-            # Get the correlation matrix
-            R = np.corrcoef(X.transpose())
+            # The projection matrix for PCA-cor and ZCA-cor is the same as the
+            # projection matrix for PCA and ZCA, respectively, on the standardized
+            # data. So, we first standardize the data, then compute the projection.
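As a quick numerical illustration of the comment above (not part of the patch): standardizing the data turns its covariance into the correlation matrix, so whitening the standardized data with PCA/ZCA is exactly correlation-based (PCA-cor/ZCA-cor) whitening:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(1)
    X = rng.normal(size=(50, 3)) @ np.triu(np.full((3, 3), 0.4) + np.eye(3) * 0.6)
    Xs = StandardScaler().fit_transform(X)
    n = X.shape[0]
    # np.cov uses an n - 1 denominator while StandardScaler divides by the
    # population std, hence the (n - 1) / n factor
    assert np.allclose(np.corrcoef(X.T), np.cov(Xs.T) * (n - 1) / n)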
 
-            # Get the eigenvalues and eigenvectors of the correlation matrix
-            try:
-                t, G = eigh(R)
-            except ValueError:
+            self.standard_scaler = StandardScaler().fit(X)
+            variances = self.standard_scaler.var_
+            if np.any(variances == 0):
                 raise ValueError(
                     "Divide by zero error, make sure low variance columns are removed"
                 )
 
-            # Fix sign ambiguity of eigenvectors
-            G = pd.DataFrame(G * np.sign(np.diag(G)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(t.clip(self.epsilon)))
-
-            # process the covariance diagonal matrix and fix rounding errors
-            v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                self.mean_centerer = StandardScaler(with_mean=True, with_std=False).fit(
+                    X
+                )
+                X = self.mean_centerer.transform(X)
+
+        # Get the number of observations and variables
+        n, d = X.shape
+
+        # Check the rank of the matrix X, considering the number of samples (n)
+        # and features (d)
+        r = np.linalg.matrix_rank(X)
+
+        # Case 1: More features than samples (n < d).
+        #   If centered (mean of each feature subtracted), one dimension becomes
+        #   dependent, reducing the rank to n - 1.
+        #   If not centered, the max rank is limited by n, as there can't be more
+        #   independent vectors than samples.
+        # Case 2: More samples than features, or equal (n >= d).
+        #   Here, the max rank is limited by d (the number of features), assuming
+        #   each provides unique information.
+
+        if not (
+            (r == d)
+            | (self.center & (r == n - 1))
+            | ((not self.center) & (r == n))
+        ):
+            raise ValueError(
+                f"The data matrix X is not full rank: n = {n}, d = {d}, r = {r}. "
+                "Perfect linear dependencies are unusual in data matrices, so "
+                "something seems amiss. Check for linear dependencies in the data "
+                "and remove them."
+            )
+
+        # Get the eigenvalues and eigenvectors of the covariance matrix
+        # by computing the SVD on the data matrix
+        # https://stats.stackexchange.com/q/134282/8494
+        _, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
+
+        # If n <= d, Sigma has shape (n,), so it will need to be expanded to
+        # length d, padded with the r'th element of Sigma
+        if n <= d:
+            # Do some error checking
+            if Sigma.shape[0] != n:
+                error_detail = f"When n <= d, Sigma should have shape (n,) i.e. ({n},) but it is {Sigma.shape}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
 
-            # Calculate the sphering matrix
-            self.W = np.dot(np.dot(D, G.transpose()), v)
+            if (r != n - 1) & (r != n):
+                error_detail = f"When n <= d, the rank should be n - 1 or n i.e. {n - 1} or {n} but it is {r}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+            Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))
+
+        Sigma = Sigma + self.epsilon
+
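A small sanity check of the eigendecomposition/SVD correspondence that the derivation below relies on (illustration only, not part of the patch): the eigenvalues of cov(X) are the squared singular values of the centered data divided by n - 1.

    import numpy as np
    from scipy.linalg import eigh

    rng = np.random.default_rng(2)
    X = rng.normal(size=(30, 4))
    X = X - X.mean(axis=0)  # mean-center each feature
    evals, _ = eigh(np.cov(X.T))  # eigenvalues of the covariance matrix
    _, S, _ = np.linalg.svd(X)    # singular values of the centered data
    assert np.allclose(np.sort(evals), np.sort(S**2 / (len(X) - 1)))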
+        # From https://arxiv.org/abs/1512.00809, the PCA whitening matrix is
+        #
+        #     W = Lambda^(-1/2) E^T
+        #
+        # where
+        # - E is the eigenvector matrix
+        # - Lambda is the (diagonal) eigenvalue matrix
+        # of cov(X), where X is the data matrix
+        # (i.e., cov(X) = E Lambda E^T)
+        #
+        # However, W can also be computed using the Singular Value Decomposition
+        # (SVD) of X. Assuming X is mean-centered (zero mean for each feature),
+        # the SVD of X is given by:
+        #
+        #     X = U S V^T
+        #
+        # The covariance matrix of X can be expressed using its SVD:
+        #
+        #     cov(X) = (X^T X) / (n - 1)
+        #            = (V S^2 V^T) / (n - 1)
+        #
+        # By comparing cov(X) = E Lambda E^T with the SVD form, we identify:
+        #
+        #     E = V                   (eigenvectors)
+        #     Lambda = S^2 / (n - 1)  (eigenvalues)
+        #
+        # Thus, Lambda^(-1/2) can be expressed as:
+        #
+        #     Lambda^(-1/2) = inv(S) * sqrt(n - 1)
+        #
+        # Therefore, the PCA whitening matrix W becomes:
+        #
+        #     W = Lambda^(-1/2) * E^T
+        #       = (inv(S) * sqrt(n - 1)) * V^T
+        #       = (inv(S) * V^T) * sqrt(n - 1)
+        #
+        # In NumPy, this is implemented as:
+        #
+        #     W = (np.linalg.inv(S) @ Vt) * np.sqrt(n - 1)
+        #
+        # But materializing S as a full d x d diagonal matrix and inverting it
+        # with `np.linalg.inv` is memory-intensive.
+        #
+        # A more memory-efficient alternative is:
+        #
+        #     W = (Vt / Sigma[:, np.newaxis]) * np.sqrt(n - 1)
+        #
+        # where Sigma contains the diagonal elements of S (singular values).
+
+        # Store W transposed so that `transform` can compute X @ self.W
+        self.W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)
+
+        # If ZCA, perform additional rotation
+        if self.method in ["ZCA", "ZCA-cor"]:
+            # Note: there was previously a requirement that r == d, since
+            # otherwise the ZCA transform would not be well defined. It later
+            # turned out that this requirement was not necessary.
+
+            self.W = self.W @ Vt
+
+        # The number of columns of self.W should be equal to that of X
 
-            # If ZCA-cor, perform additional rotation
-            if self.method == "ZCA-cor":
-                self.W = np.dot(G, self.W)
+        if self.W.shape[1] != X.shape[1]:
+            error_detail = (
+                f"The number of columns of W should be equal to that of X. "
+                f"However, W has {self.W.shape[1]} columns, X has {X.shape[1]} columns"
+            )
+            context = "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+            raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
 
         return self
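A quick verification of the two claims in the comment above (illustration only, not part of the patch): the broadcasting form equals the explicit matrix-inverse form, and the resulting W actually whitens the data:

    import numpy as np

    rng = np.random.default_rng(3)
    X = rng.normal(size=(200, 5))
    X = X - X.mean(axis=0)  # mean-center each feature
    n = X.shape[0]
    _, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
    # Row-wise division by Sigma is equivalent to inv(diag(Sigma)) @ Vt
    assert np.allclose(Vt / Sigma[:, np.newaxis], np.diag(1.0 / Sigma) @ Vt)
    W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)
    # Whitened data has (approximately) identity covariance
    assert np.allclose(np.cov((X @ W).T), np.eye(5), atol=1e-8)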
@@ -137,7 +239,29 @@ def transform(self, X, y=None):
         pandas.core.frame.DataFrame
             Spherized dataframe
         """
-        return np.dot(X - self.mu, self.W.transpose())
+
+        columns = X.columns
+
+        # Get Numpy representation of the DataFrame
+        X = X.values
+
+        if self.method in ["PCA-cor", "ZCA-cor"]:
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                X = self.mean_centerer.transform(X)
+
+        if self.method in ["PCA", "PCA-cor"]:
+            columns = ["PC" + str(i) for i in range(1, X.shape[1] + 1)]
+
+        self.columns = columns
+
+        XW = X @ self.W
+
+        if self.return_numpy:
+            return XW
+        else:
+            return pd.DataFrame(XW, columns=columns)
 
 
 class RobustMAD(BaseEstimator, TransformerMixin):
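For reference, a short usage sketch of the reworked transform (illustration only; the dataframe is made up). PCA-flavored methods rename the output columns to PC1..PCd, ZCA-flavored methods keep the input feature names, and return_numpy=True skips the DataFrame wrapper:

    import numpy as np
    import pandas as pd
    from pycytominer.operations import Spherize

    df = pd.DataFrame(
        np.random.default_rng(4).normal(size=(40, 3)), columns=["x", "y", "z"]
    )

    print(list(Spherize(method="PCA").fit(df).transform(df).columns))  # ['PC1', 'PC2', 'PC3']
    print(list(Spherize(method="ZCA").fit(df).transform(df).columns))  # ['x', 'y', 'z']
    print(type(Spherize(method="ZCA", return_numpy=True).fit(df).transform(df)))  # numpy.ndarray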
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
index e787ead0..ffbb07d6 100644
--- a/tests/test_normalize.py
+++ b/tests/test_normalize.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 from pycytominer.normalize import normalize
+import pytest
 
 random.seed(123)
 
@@ -445,59 +446,74 @@ def test_normalize_standardize_allsamples_compress():
 def test_normalize_spherize():
     for spherize_method in ["ZCA", "PCA", "ZCA-cor", "PCA-cor"]:
         for spherize_center in [True, False]:
-            result = normalize(
-                data_spherize_df,
-                features=["a", "b", "c", "d"],
-                meta_features=["id"],
-                method="spherize",
-                spherize_method=spherize_method,
-                spherize_center=spherize_center,
-            )
-            result_cov = (
-                pd.DataFrame(np.cov(np.transpose(result.drop("id", axis="columns"))))
-                .round()
-                .sum()
-                .clip(1)
-                .sum()
-            )
-            expected_result = data_spherize_df.shape[1] - 1
-            assert result_cov == expected_result
-
-            result = normalize(
-                data_spherize_df,
-                samples="id == 'control'",
-                features=["a", "b", "c", "d"],
-                meta_features=["id"],
-                method="spherize",
-                spherize_method=spherize_method,
-                spherize_center=spherize_center,
-            )
-            result_cov = (
-                np.cov(
-                    np.transpose(
-                        result.query("id == 'control'").drop("id", axis="columns")
-                    )
+            if spherize_method in ["PCA-cor", "ZCA-cor"] and not spherize_center:
+                # verify that calling normalize will throw a ValueError
+                with pytest.raises(ValueError):
+                    normalize(
+                        data_spherize_df,
+                        features=["a", "b", "c", "d"],
+                        meta_features=["id"],
+                        method="spherize",
+                        spherize_method=spherize_method,
+                        spherize_center=spherize_center,
+                    )
+
+            else:
+                result = normalize(
+                    data_spherize_df,
+                    features=["a", "b", "c", "d"],
+                    meta_features=["id"],
+                    method="spherize",
+                    spherize_method=spherize_method,
+                    spherize_center=spherize_center,
+                )
+                result_cov = (
+                    pd.DataFrame(
+                        np.cov(np.transpose(result.drop("id", axis="columns")))
+                    )
+                    .round()
+                    .sum()
+                    .clip(1)
+                    .sum()
+                )
+                expected_result = data_spherize_df.shape[1] - 1
+                assert result_cov == expected_result
+
+                result = normalize(
+                    data_spherize_df,
+                    samples="id == 'control'",
+                    features=["a", "b", "c", "d"],
+                    meta_features=["id"],
+                    method="spherize",
+                    spherize_method=spherize_method,
+                    spherize_center=spherize_center,
+                )
+                result_cov = (
+                    np.cov(
+                        np.transpose(
+                            result.query("id == 'control'").drop("id", axis="columns")
+                        )
+                    )
+                    .round()
+                    .sum()
+                    .clip(1)
+                    .sum()
                 )
-                .round()
-                .sum()
-                .clip(1)
-                .sum()
-            )
-            # Add some tolerance to result b/c of low sample size
-            expected_result = data_spherize_df.shape[1]
-            assert result_cov < expected_result
-
-            non_spherize_result_cov = (
-                np.cov(
-                    np.transpose(
-                        result.query("id == 'treatment'").drop("id", axis="columns")
+                # Add some tolerance to result b/c of low sample size
+                expected_result = data_spherize_df.shape[1]
+                assert result_cov < expected_result
+
+                non_spherize_result_cov = (
+                    np.cov(
+                        np.transpose(
+                            result.query("id == 'treatment'").drop("id", axis="columns")
+                        )
                     )
-                )
-                .round()
-                .sum()
-                .sum()
-            )
-            assert non_spherize_result_cov >= expected_result - 5
+                    .round()
+                    .sum()
+                    .sum()
+                )
+                assert non_spherize_result_cov >= expected_result - 5
 
 
 def test_spherize_epsilon():
diff --git a/tests/test_operations/test_transform.py b/tests/test_operations/test_transform.py
index 25b914ff..4ee08ddc 100644
--- a/tests/test_operations/test_transform.py
+++ b/tests/test_operations/test_transform.py
@@ -22,6 +22,8 @@ def test_spherize():
     spherize_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
     for method in spherize_methods:
         for center in [True, False]:
+            if method in ["PCA-cor", "ZCA-cor"] and not center:
+                continue
             scaler = Spherize(method=method, center=center)
             scaler = scaler.fit(data_df)
             transform_df = scaler.transform(data_df)
@@ -46,6 +48,8 @@ def test_low_variance_spherize():
     spherize_methods = ["PCA-cor", "ZCA-cor"]
     for method in spherize_methods:
         for center in [True, False]:
+            if method in ["PCA-cor", "ZCA-cor"] and not center:
+                continue
             scaler = Spherize(method=method, center=center)
             with pytest.raises(ValueError) as errorinfo:
                 scaler = scaler.fit(data_no_variance)
@@ -57,6 +61,8 @@ def test_spherize_precenter():
     data_precentered = data_df - data_df.mean()
     spherize_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
     for method in spherize_methods:
+        if method in ["PCA-cor", "ZCA-cor"]:
+            continue
         scaler = Spherize(method=method, center=False)
         scaler = scaler.fit(data_precentered)
         transform_df = scaler.transform(data_df)
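Finally, a one-line illustration (not part of the patch) of the new guard that the tests above skip around: requesting a correlation-based method without centering now fails at construction time.

    import pytest
    from pycytominer.operations import Spherize

    with pytest.raises(ValueError, match="PCA-cor and ZCA-cor require center=True"):
        Spherize(method="PCA-cor", center=False)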