Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Spherize – use SVD, simplify calculations #320

Merged
merged 32 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7ea8197
https://github.com/cytomining/pycytominer/issues/319
shntnu Sep 9, 2023
89cd244
Rewrite spherize
shntnu Sep 9, 2023
4881616
revert
shntnu Sep 9, 2023
2e9239f
black
shntnu Sep 9, 2023
7a0e51d
skip test
shntnu Sep 9, 2023
47e4e9e
comment
shntnu Sep 9, 2023
b5e1767
docs
shntnu Sep 9, 2023
c847f15
Handle types
shntnu Sep 9, 2023
bc47aa7
flag zero var
shntnu Sep 9, 2023
5f806f7
skip some conditions
shntnu Sep 9, 2023
66993dc
typo
shntnu Sep 9, 2023
804dc16
black
shntnu Sep 9, 2023
3554c32
typo
shntnu Sep 9, 2023
8d6648e
checks
shntnu Sep 10, 2023
78dcb54
Handle low rank
shntnu Sep 10, 2023
33cbb30
fix size check
shntnu Sep 10, 2023
525cbb5
Merge branch 'cytomining:main' into spherize_fix
shntnu Dec 9, 2023
d1f580c
prompt user to report bug
shntnu Dec 9, 2023
351da3a
change assertions to exceptions
shntnu Dec 9, 2023
99143d3
Use return_numpy in Spherize
shntnu Dec 9, 2023
b771fc7
Fix typo
shntnu Dec 9, 2023
1d9cf55
handle change of column names
shntnu Dec 9, 2023
73f8d62
add test
shntnu Dec 10, 2023
1cf9032
test centering
shntnu Dec 10, 2023
039e515
change assertions to exceptions, and explain better
shntnu Dec 10, 2023
e7ef835
clean up error messages, remove requirement for ZCA
shntnu Dec 11, 2023
05c4567
Drop `create_bug_report_message`
shntnu Jan 8, 2024
848cd18
Update error message
shntnu Jan 8, 2024
bff48c9
Math clarifications
shntnu Jan 9, 2024
82021d7
Merge branch 'cytomining:main' into spherize_fix
shntnu Jan 12, 2024
e6f5bc2
Clarify why we don't use np.linalg.inv(S)
shntnu Jan 20, 2024
98f8d6f
black missed this
shntnu Jan 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,7 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler

from pycytominer.cyto_utils import (
output,
infer_cp_features,
load_profiles,
)
from pycytominer.cyto_utils import output, infer_cp_features, load_profiles, util
from pycytominer.operations import Spherize, RobustMAD


Expand Down Expand Up @@ -141,7 +137,10 @@ def normalize(
scaler = RobustMAD(epsilon=mad_robustize_epsilon)
elif method == "spherize":
scaler = Spherize(
center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
center=spherize_center,
method=spherize_method,
epsilon=spherize_epsilon,
return_numpy=True,
)

if features == "infer":
Expand All @@ -161,15 +160,30 @@ def normalize(
# Subset to only the features measured in the sample query
fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

# Scale the feature dataframe
fitted_scaled = fitted_scaler.transform(feature_df)

columns = fitted_scaler.columns if method == "spherize" else feature_df.columns

feature_df = pd.DataFrame(
fitted_scaler.transform(feature_df),
columns=feature_df.columns,
fitted_scaled,
columns=columns,
index=feature_df.index,
)

normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

if feature_df.shape != profiles.loc[:, features].shape:
error_detail = "The number of rows and columns in the feature dataframe does not match the original dataframe"
context = f"the `{method}` method in `pycytominer.normalize`"
raise ValueError(f"{error_detail}. This is likely a bug in {context}")

if (normalized.shape[0] != profiles.shape[0]) or (
normalized.shape[1] != len(features) + len(meta_features)
):
error_detail = "The number of rows and columns in the normalized dataframe does not match the original dataframe"
context = f"the `{method}` method in `pycytominer.normalize`"
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

if output_file != None:
output(
df=normalized,
Expand Down
218 changes: 170 additions & 48 deletions pycytominer/operations/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from scipy.linalg import eigh
from scipy.stats import median_abs_deviation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from pycytominer.cyto_utils import util


class Spherize(BaseEstimator, TransformerMixin):
Expand All @@ -33,7 +35,7 @@ class Spherize(BaseEstimator, TransformerMixin):
a string indicating which class of sphering to perform
"""

def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False):
"""
Parameters
----------
Expand All @@ -43,17 +45,28 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
option to center the input X matrix
method : str, default "ZCA"
a string indicating which class of sphering to perform
return_numpy: bool, default False
option to return ndarray, instead of dataframe
"""
avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]

self.epsilon = epsilon
self.center = center
self.return_numpy = return_numpy

assert (
method in avail_methods
), f"Error {method} not supported. Select one of {avail_methods}"
if method not in avail_methods:
raise ValueError(
f"Error {method} not supported. Select one of {avail_methods}"
)
self.method = method

# PCA-cor and ZCA-cor require center=True because we assumed we are
# only ever interested in computing centered Pearson correlation
# https://stackoverflow.com/questions/23891391/uncentered-pearson-correlation

if self.method in ["PCA-cor", "ZCA-cor"] and not self.center:
raise ValueError("PCA-cor and ZCA-cor require center=True")

def fit(self, X, y=None):
"""Identify the sphering transform given self.X

Expand All @@ -67,58 +80,145 @@ def fit(self, X, y=None):
self
With computed weights attribute
"""
# Get the mean of the features (columns) and center if specified
self.mu = X.mean()
if self.center:
X = X - self.mu

# Get the covariance matrix
C = (1 / X.shape[0]) * np.dot(X.transpose(), X)

if self.method in ["PCA", "ZCA"]:
# Get the eigenvalues and eigenvectors of the covariance matrix
s, U = eigh(C)

# Fix sign ambiguity of eigenvectors
U = pd.DataFrame(U * np.sign(np.diag(U)))

# Process the eigenvalues into a diagonal matrix and fix rounding errors
D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))

# Calculate the sphering matrix
self.W = np.dot(D, U.transpose())

# If ZCA, perform additional rotation
if self.method == "ZCA":
self.W = np.dot(U, self.W)
# Get Numpy representation of the DataFrame
X = X.values

if self.method in ["PCA-cor", "ZCA-cor"]:
# Get the correlation matrix
R = np.corrcoef(X.transpose())
# The projection matrix for PCA-cor and ZCA-cor is the same as the
# projection matrix for PCA and ZCA, respectively, on the standardized
# data. So, we first standardize the data, then compute the projection

# Get the eigenvalues and eigenvectors of the correlation matrix
try:
t, G = eigh(R)
except ValueError:
self.standard_scaler = StandardScaler().fit(X)
variances = self.standard_scaler.var_
if np.any(variances == 0):
raise ValueError(
"Divide by zero error, make sure low variance columns are removed"
)

# Fix sign ambiguity of eigenvectors
G = pd.DataFrame(G * np.sign(np.diag(G)))

# Process the eigenvalues into a diagonal matrix and fix rounding errors
D = np.diag(1.0 / np.sqrt(t.clip(self.epsilon)))

# process the covariance diagonal matrix and fix rounding errors
v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))
X = self.standard_scaler.transform(X)
else:
if self.center:
self.mean_centerer = StandardScaler(with_mean=True, with_std=False).fit(
X
)
X = self.mean_centerer.transform(X)

# Get the number of observations and variables
n, d = X.shape

# Checking the rank of matrix X considering the number of samples (n) and features (d).
r = np.linalg.matrix_rank(X)

# Case 1: More features than samples (n < d).
# If centered (mean of each feature subtracted), one dimension becomes dependent, reducing rank to n - 1.
# If not centered, the max rank is limited by n, as there can't be more independent vectors than samples.
# Case 2: More samples than features or equal (n >= d).
# Here, the max rank is limited by d (number of features), assuming each provides unique information.

if not ((r == d) | (self.center & (r == n - 1)) | (not self.center & (r == n))):
raise ValueError(
(
"The data matrix X is not full rank: n = {n}, d = {d}, r = {r}."
"Perfect linear dependencies are unusual in data matrices so something seems amiss."
"Check for linear dependencies in the data and remove them."
)
)

# Get the eigenvalues and eigenvectors of the covariance matrix
# by computing the SVD on the data matrix
# https://stats.stackexchange.com/q/134282/8494
_, Sigma, Vt = np.linalg.svd(X, full_matrices=True)

# if n <= d then Sigma has shape (n,) so it will need to be expanded to
# d filled with the value r'th element of Sigma
if n <= d:
# Do some error checking
if Sigma.shape[0] != n:
error_detail = f"When n <= d, Sigma should have shape (n,) i.e. ({n}, ) but it is {Sigma.shape}"
context = (
"the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
)
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

# Calculate the sphering matrix
self.W = np.dot(np.dot(D, G.transpose()), v)
if (r != n - 1) & (r != n):
error_detail = f"When n <= d, the rank should be n - 1 or n i.e. {n - 1} or {n} but it is {r}"
context = (
"the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
)
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))

Sigma = Sigma + self.epsilon

# From https://arxiv.org/abs/1512.00809, the PCA whitening matrix is
#
# W = Lambda^(-1/2) E^T
#
# where
# - E is the eigenvector matrix
# - Lambda is the (diagonal) eigenvalue matrix
# of cov(X), where X is the data matrix
# (i.e., cov(X) = E Lambda E^T)
#
# However, W can also be computed using the Singular Value Decomposition
# (SVD) of X. Assuming X is mean-centered (zero mean for each feature),
# the SVD of X is given by:
#
# X = U S V^T
#
# The covariance matrix of X can be expressed using its SVD:
#
# cov(X) = (X^T X) / (n - 1)
# = (V S^2 V^T) / (n - 1)
#
# By comparing cov(X) = E * Lambda * E^T with the SVD form, we identify:
#
# E = V (eigenvectors)
# Lambda = S^2 / (n - 1) (eigenvalues)
#
# Thus, Lambda^(-1/2) can be expressed as:
#
# Lambda^(-1/2) = inv(S) * sqrt(n - 1)
#
# Therefore, the PCA Whitening matrix W becomes:
#
# W = Lambda^(-1/2) * E^T
# = (inv(S) * sqrt(n - 1)) * V^T
# = (inv(S) * V^T) * sqrt(n - 1)
#
# In NumPy, this is implemented as:
#
# W = (np.linalg.inv(S) @ Vt) * np.sqrt(n - 1)
#
# Alternatively, this can be expressed as:
#
# W = (Vt / Sigma[:, np.newaxis]) * np.sqrt(n - 1)
#
# where Sigma contains the diagonal elements of S (singular values).

self.W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)

# If ZCA, perform additional rotation
if self.method in ["ZCA", "ZCA-cor"]:
# Note: There was previously a requirement r==d otherwise the
# ZCA transform would not be well defined. However, it later appeared
# that this requirement was not necessary.

self.W = self.W @ Vt

# number of columns of self.W should be equal to that of X
assert (
self.W.shape[1] == X.shape[1]
), f"Error: W has {self.W.shape[1]} columns, X has {X.shape[1]} columns"

# If ZCA-cor, perform additional rotation
if self.method == "ZCA-cor":
self.W = np.dot(G, self.W)
if self.W.shape[1] != X.shape[1]:
error_detail = (
f"The number of columns of W should be equal to that of X."
f"However, W has {self.W.shape[1]} columns, X has {X.shape[1]} columns"
)
context = "the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

return self

Expand All @@ -137,7 +237,29 @@ def transform(self, X, y=None):
pandas.core.frame.DataFrame
Spherized dataframe
"""
return np.dot(X - self.mu, self.W.transpose())

columns = X.columns

# Get Numpy representation of the DataFrame
X = X.values

if self.method in ["PCA-cor", "ZCA-cor"]:
X = self.standard_scaler.transform(X)
else:
if self.center:
X = self.mean_centerer.transform(X)

if self.method in ["PCA", "PCA-cor"]:
columns = ["PC" + str(i) for i in range(1, X.shape[1] + 1)]

self.columns = columns

XW = X @ self.W

if self.return_numpy:
return XW
else:
return pd.DataFrame(XW, columns=columns)


class RobustMAD(BaseEstimator, TransformerMixin):
Expand Down
Loading