Fix Spherize – use SVD, simplify calculations (#320)
* #319

* Rewrite spherize

* revert

* black

* skip test

* comment

* docs

* Handle types

* flag zero var

* skip some conditions

* typo

* black

* typo

* checks

* Handle low rank

* fix size check

* prompt user to report bug

* change assertions to exceptions

* Use return_numpy in Spherize

* Fix typo

* handle change of column names

* add test

* test centering

* change assertions to exceptions, and explain better

* clean up error messages, remove requirement for ZCA

* Drop `create_bug_report_message`

* Update error message

* Math clarifications

* Clarify why we don't use np.linalg.inv(S)

* black missed this
shntnu authored Jan 26, 2024
1 parent cf313bf commit 8541374
Showing 4 changed files with 266 additions and 106 deletions.
32 changes: 23 additions & 9 deletions pycytominer/normalize.py
@@ -5,11 +5,7 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler

-from pycytominer.cyto_utils import (
-    output,
-    infer_cp_features,
-    load_profiles,
-)
+from pycytominer.cyto_utils import output, infer_cp_features, load_profiles, util
from pycytominer.operations import Spherize, RobustMAD


@@ -141,7 +137,10 @@ def normalize(
        scaler = RobustMAD(epsilon=mad_robustize_epsilon)
    elif method == "spherize":
        scaler = Spherize(
-            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
+            center=spherize_center,
+            method=spherize_method,
+            epsilon=spherize_epsilon,
+            return_numpy=True,
        )

    if features == "infer":
@@ -161,15 +160,30 @@
    # Subset to only the features measured in the sample query
    fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

+    # Scale the feature dataframe
+    fitted_scaled = fitted_scaler.transform(feature_df)
+
+    columns = fitted_scaler.columns if method == "spherize" else feature_df.columns
+
    feature_df = pd.DataFrame(
-        fitted_scaler.transform(feature_df),
-        columns=feature_df.columns,
+        fitted_scaled,
+        columns=columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

+    if feature_df.shape != profiles.loc[:, features].shape:
+        error_detail = "The number of rows and columns in the feature dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+    if (normalized.shape[0] != profiles.shape[0]) or (
+        normalized.shape[1] != len(features) + len(meta_features)
+    ):
+        error_detail = "The number of rows and columns in the normalized dataframe does not match the original dataframe"
+        context = f"the `{method}` method in `pycytominer.normalize`"
+        raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

    if output_file != None:
        output(
            df=normalized,
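
For context, here is a minimal sketch of how the updated spherize path is exercised end to end through `pycytominer.normalize`. The DataFrame, column names, and values are illustrative only, and the keyword arguments assume the existing `normalize` signature used in this file:

import pandas as pd
from pycytominer import normalize

# Toy profiles: two metadata columns and three feature columns
profiles = pd.DataFrame({
    "Metadata_plate": ["p1"] * 6,
    "Metadata_well": ["A1", "A2", "A3", "B1", "B2", "B3"],
    "x": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    "y": [2.0, 1.0, 4.0, 3.0, 6.0, 5.0],
    "z": [0.5, 1.5, 1.0, 2.5, 2.0, 3.0],
})

# With method="spherize", the scaler is now constructed with
# return_numpy=True, and the output feature columns come from
# fitted_scaler.columns (PC1..PCd for the PCA variants).
normalized = normalize(
    profiles=profiles,
    features=["x", "y", "z"],
    meta_features=["Metadata_plate", "Metadata_well"],
    method="spherize",
    spherize_method="ZCA",
)
print(normalized.columns.tolist())
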
220 changes: 172 additions & 48 deletions pycytominer/operations/transform.py
@@ -11,6 +11,8 @@
from scipy.linalg import eigh
from scipy.stats import median_abs_deviation
from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler
+from pycytominer.cyto_utils import util


class Spherize(BaseEstimator, TransformerMixin):
Expand All @@ -33,7 +35,7 @@ class Spherize(BaseEstimator, TransformerMixin):
        a string indicating which class of sphering to perform
    """

-    def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
+    def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False):
        """
        Parameters
        ----------
----------
@@ -43,17 +45,28 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
            option to center the input X matrix
        method : str, default "ZCA"
            a string indicating which class of sphering to perform
+        return_numpy : bool, default False
+            option to return a numpy ndarray instead of a DataFrame
        """
        avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]

        self.epsilon = epsilon
        self.center = center
+        self.return_numpy = return_numpy

-        assert (
-            method in avail_methods
-        ), f"Error {method} not supported. Select one of {avail_methods}"
+        if method not in avail_methods:
+            raise ValueError(
+                f"Error {method} not supported. Select one of {avail_methods}"
+            )
        self.method = method

+        # PCA-cor and ZCA-cor require center=True because we assume we are
+        # only ever interested in computing the centered Pearson correlation
+        # https://stackoverflow.com/questions/23891391/uncentered-pearson-correlation
+        if self.method in ["PCA-cor", "ZCA-cor"] and not self.center:
+            raise ValueError("PCA-cor and ZCA-cor require center=True")

    def fit(self, X, y=None):
        """Identify the sphering transform given self.X

@@ -67,58 +80,147 @@
            self
            With computed weights attribute
        """
-        # Get the mean of the features (columns) and center if specified
-        self.mu = X.mean()
-        if self.center:
-            X = X - self.mu
-
-        # Get the covariance matrix
-        C = (1 / X.shape[0]) * np.dot(X.transpose(), X)
-
-        if self.method in ["PCA", "ZCA"]:
-            # Get the eigenvalues and eigenvectors of the covariance matrix
-            s, U = eigh(C)
-
-            # Fix sign ambiguity of eigenvectors
-            U = pd.DataFrame(U * np.sign(np.diag(U)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))
-
-            # Calculate the sphering matrix
-            self.W = np.dot(D, U.transpose())
-
-            # If ZCA, perform additional rotation
-            if self.method == "ZCA":
-                self.W = np.dot(U, self.W)
+        # Get the Numpy representation of the DataFrame
+        X = X.values

        if self.method in ["PCA-cor", "ZCA-cor"]:
-            # Get the correlation matrix
-            R = np.corrcoef(X.transpose())
-
-            # Get the eigenvalues and eigenvectors of the correlation matrix
-            try:
-                t, G = eigh(R)
-            except ValueError:
-                raise ValueError(
-                    "Divide by zero error, make sure low variance columns are removed"
-                )
-
-            # Fix sign ambiguity of eigenvectors
-            G = pd.DataFrame(G * np.sign(np.diag(G)))
-
-            # Process the eigenvalues into a diagonal matrix and fix rounding errors
-            D = np.diag(1.0 / np.sqrt(t.clip(self.epsilon)))
-
-            # process the covariance diagonal matrix and fix rounding errors
-            v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))
+            # The projection matrix for PCA-cor and ZCA-cor is the same as the
+            # projection matrix for PCA and ZCA, respectively, on the
+            # standardized data. So, we first standardize the data, then
+            # compute the projection matrix as usual.
+            self.standard_scaler = StandardScaler().fit(X)
+            variances = self.standard_scaler.var_
+            if np.any(variances == 0):
+                raise ValueError(
+                    "Divide by zero error, make sure low variance columns are removed"
+                )
+
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                self.mean_centerer = StandardScaler(with_mean=True, with_std=False).fit(
+                    X
+                )
+                X = self.mean_centerer.transform(X)

+        # Get the number of observations and variables
+        n, d = X.shape
+
+        # Check the rank of matrix X, considering the number of samples (n)
+        # and features (d)
+        r = np.linalg.matrix_rank(X)
+
+        # Case 1: More features than samples (n < d).
+        # If centered (mean of each feature subtracted), one dimension becomes
+        # dependent, reducing the rank to n - 1. If not centered, the maximum
+        # rank is n, as there cannot be more independent vectors than samples.
+        # Case 2: At least as many samples as features (n >= d).
+        # Here, the maximum rank is d (the number of features), assuming each
+        # feature provides unique information.
+        if not (
+            (r == d)
+            or (self.center and r == n - 1)
+            or (not self.center and r == n)
+        ):
+            raise ValueError(
+                f"The data matrix X is not full rank: n = {n}, d = {d}, r = {r}. "
+                "Perfect linear dependencies are unusual in data matrices, so "
+                "something seems amiss. Check for linear dependencies in the "
+                "data and remove them."
+            )

+        # Get the eigenvalues and eigenvectors of the covariance matrix
+        # by computing the SVD of the data matrix
+        # https://stats.stackexchange.com/q/134282/8494
+        _, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
+
+        # If n <= d, then Sigma has shape (n,), so it needs to be expanded to
+        # length d, filled with the value of the r'th element of Sigma
+        if n <= d:
+            # Do some error checking
+            if Sigma.shape[0] != n:
+                error_detail = f"When n <= d, Sigma should have shape (n,), i.e. ({n},), but it is {Sigma.shape}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+            if (r != n - 1) and (r != n):
+                error_detail = f"When n <= d, the rank should be n - 1 or n, i.e. {n - 1} or {n}, but it is {r}"
+                context = (
+                    "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+                )
+                raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
+
+            Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))
+
+        Sigma = Sigma + self.epsilon

+        # From https://arxiv.org/abs/1512.00809, the PCA whitening matrix is
+        #
+        #   W = Lambda^(-1/2) E^T
+        #
+        # where
+        # - E is the eigenvector matrix
+        # - Lambda is the (diagonal) eigenvalue matrix
+        # of cov(X), where X is the data matrix
+        # (i.e., cov(X) = E Lambda E^T)
+        #
+        # However, W can also be computed using the Singular Value Decomposition
+        # (SVD) of X. Assuming X is mean-centered (zero mean for each feature),
+        # the SVD of X is given by:
+        #
+        #   X = U S V^T
+        #
+        # The covariance matrix of X can be expressed using its SVD:
+        #
+        #   cov(X) = (X^T X) / (n - 1)
+        #          = (V S^2 V^T) / (n - 1)
+        #
+        # By comparing cov(X) = E Lambda E^T with the SVD form, we identify:
+        #
+        #   E = V                  (eigenvectors)
+        #   Lambda = S^2 / (n - 1) (eigenvalues)
+        #
+        # Thus, Lambda^(-1/2) can be expressed as:
+        #
+        #   Lambda^(-1/2) = inv(S) * sqrt(n - 1)
+        #
+        # Therefore, the PCA whitening matrix W becomes:
+        #
+        #   W = Lambda^(-1/2) * E^T
+        #     = (inv(S) * sqrt(n - 1)) * V^T
+        #     = (inv(S) * V^T) * sqrt(n - 1)
+        #
+        # In NumPy, this could be implemented as:
+        #
+        #   W = (np.linalg.inv(S) @ Vt) * np.sqrt(n - 1)
+        #
+        # But computing `np.linalg.inv(S)` is memory-intensive.
+        #
+        # A more memory-efficient alternative is:
+        #
+        #   W = (Vt / Sigma[:, np.newaxis]) * np.sqrt(n - 1)
+        #
+        # where Sigma contains the diagonal elements of S (the singular values).
+
+        # Note: we store the transpose, i.e. V @ inv(S) * sqrt(n - 1), so that
+        # `transform` can compute the whitened data as X @ self.W
+        self.W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)

+        # If ZCA or ZCA-cor, perform an additional rotation
+        if self.method in ["ZCA", "ZCA-cor"]:
+            # Note: there was previously a requirement that r == d, without
+            # which the ZCA transform would not be well defined. However, it
+            # later appeared that this requirement was not necessary.
+            self.W = self.W @ Vt
+
+        # The number of columns of self.W should equal that of X
-        assert (
-            self.W.shape[1] == X.shape[1]
-        ), f"Error: W has {self.W.shape[1]} columns, X has {X.shape[1]} columns"
-
-        # If ZCA-cor, perform additional rotation
-        if self.method == "ZCA-cor":
-            self.W = np.dot(G, self.W)
+        if self.W.shape[1] != X.shape[1]:
+            error_detail = (
+                "The number of columns of W should be equal to that of X. "
+                f"However, W has {self.W.shape[1]} columns and X has {X.shape[1]} columns"
+            )
+            context = "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
+            raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

        return self
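
A side note on the PCA-cor/ZCA-cor branch in fit() above: the correlation matrix of X is exactly the covariance matrix of the standardized data, which is why standardizing first and then proceeding as for PCA/ZCA is equivalent to working with correlations. A minimal sketch (toy values, not from the repository):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(4)
X = rng.normal(size=(50, 3)) * np.array([1.0, 10.0, 0.1])  # mixed feature scales

# corrcoef(X) equals the (ddof=0) covariance of the standardized data
Z = StandardScaler().fit_transform(X)
R = np.corrcoef(X, rowvar=False)
C = (Z.T @ Z) / len(Z)
print(np.allclose(R, C))  # True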
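
The rank argument used by fit() can also be checked numerically. A small sketch, assuming a random matrix with more features than samples:

import numpy as np

rng = np.random.default_rng(0)
n, d = 5, 10  # more features than samples
X = rng.normal(size=(n, d))
Xc = X - X.mean(axis=0)  # center each feature (column)

# A generic X has rank min(n, d) = n; centering introduces one linear
# dependency across rows, dropping the rank to n - 1.
print(np.linalg.matrix_rank(X))   # 5
print(np.linalg.matrix_rank(Xc))  # 4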
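
The Sigma-padding branch (n <= d) can likewise be traced with a toy example; the shapes below follow from `np.linalg.svd` with full_matrices=True:

import numpy as np

rng = np.random.default_rng(1)
n, d, eps = 4, 6, 1e-6
X = rng.normal(size=(n, d))
X = X - X.mean(axis=0)  # centered, so the rank r is n - 1 = 3

_, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
print(Sigma.shape, Vt.shape)  # (4,) (6, 6)

r = np.linalg.matrix_rank(X)
# Pad Sigma out to length d by repeating the r'th singular value, then
# regularize, mirroring the n <= d branch of fit() above.
Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))
Sigma = Sigma + eps
print(Sigma.shape)  # (6,)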
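
Finally, the SVD-based whitening matrix can be sanity-checked: after projecting with W, the sample covariance of the data should be approximately the identity. A minimal sketch using the same formula as fit(), with the epsilon regularization omitted since this X is full rank:

import numpy as np

rng = np.random.default_rng(2)
n, d = 200, 3
X = rng.normal(size=(n, d)) @ rng.normal(size=(d, d))  # correlated features
X = X - X.mean(axis=0)

_, Sigma, Vt = np.linalg.svd(X, full_matrices=True)
W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)

# After whitening, the sample covariance of X @ W is ~identity
Z = X @ W
print(np.allclose(np.cov(Z, rowvar=False), np.eye(d)))  # True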

@@ -137,7 +239,29 @@ def transform(self, X, y=None):
        pandas.core.frame.DataFrame
            Spherized dataframe (or numpy.ndarray if return_numpy=True)
        """
-        return np.dot(X - self.mu, self.W.transpose())
+        columns = X.columns
+
+        # Get the Numpy representation of the DataFrame
+        X = X.values
+
+        if self.method in ["PCA-cor", "ZCA-cor"]:
+            X = self.standard_scaler.transform(X)
+        else:
+            if self.center:
+                X = self.mean_centerer.transform(X)
+
+        if self.method in ["PCA", "PCA-cor"]:
+            columns = ["PC" + str(i) for i in range(1, X.shape[1] + 1)]
+
+        self.columns = columns
+
+        XW = X @ self.W
+
+        if self.return_numpy:
+            return XW
+        else:
+            return pd.DataFrame(XW, columns=columns)


class RobustMAD(BaseEstimator, TransformerMixin):
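
And a minimal sketch of using `Spherize` directly, showing the new `return_numpy` option and the PC column renaming performed by transform() (toy data, illustrative only):

import numpy as np
import pandas as pd
from pycytominer.operations import Spherize

df = pd.DataFrame(
    np.random.default_rng(3).normal(size=(20, 4)),
    columns=["f1", "f2", "f3", "f4"],
)

# PCA sphering renames the output columns to PC1..PC4
spherized = Spherize(method="PCA").fit(df).transform(df)
print(list(spherized.columns))  # ['PC1', 'PC2', 'PC3', 'PC4']

# return_numpy=True yields an ndarray instead of a DataFrame
arr = Spherize(method="ZCA", return_numpy=True).fit(df).transform(df)
print(type(arr))  # <class 'numpy.ndarray'>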
