Skip to content

Commit

Permalink
clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
idc9 committed Oct 15, 2020
1 parent 8f686e0 commit 7a6f689
Show file tree
Hide file tree
Showing 12 changed files with 75 additions and 72 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,11 @@ Testing is done using nose.

We welcome contributions to make this a stronger package: data examples, bug fixes, spelling errors, new features, etc.


<!--
# Citation
You can use the below badg to generate a DOI and bibtex citation
You can use the below badge to generate a DOI and bibtex citation
[![DOI](https://zenodo.org/badge/TODO.svg)](https://zenodo.org/badge/latestdoi/TODO)
-->
32 changes: 12 additions & 20 deletions ya_pca/PCA.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

from warnings import warn

Expand Down Expand Up @@ -114,16 +115,10 @@ def fit(self, X):
# ensure_2d=True, copy=self.copy)

self.metadata_ = {'shape': X.shape}
# if isinstance(X, pd.DataFrame):
# sample_names = X.index.values
# var_names = X.columns.values

# possibly center X
if self.center:
# assert X is not sparse
X = np.array(X)
self.center_ = X.mean(axis=0)
X = X - self.center_

self.centerer_ = StandardScaler(copy=True, with_mean=self.center,
with_std=False)
X = self.centerer_.fit_transform(X)

# smallest rank the SVD can be
m = min(X.shape)
Expand Down Expand Up @@ -169,7 +164,7 @@ def fit(self, X):

start_time = time()
UDV, rank_est, out = select_rank(X=X, method=self.n_components,
UDV=UDV, **self.rank_sel_kws)
UDV=UDV, **self.rank_sel_kws)

self.metadata_['rank_sel_runtime'] = time() - start_time

Expand All @@ -185,7 +180,7 @@ def fit(self, X):
" to look at diagnostics for the rank selection method."
.format(rank_est))

# TODO: possibly get noise estimate
# TODO: get noise estimate

return self

Expand Down Expand Up @@ -267,16 +262,11 @@ def transform(self, X):
-------
X_new : array-like, shape (n_samples, n_components)
Examples
--------
TODO
"""
check_is_fitted(self, attributes='loadings_')

X = check_array(X)
if hasattr(self, 'center_'):
X = X - self.center_

X = self.centerer_.transform(X)
projections = np.dot(X, self.loadings_)
# if self.whiten:
# X_transformed /= np.sqrt(self.explained_variance_)
Expand All @@ -301,8 +291,9 @@ def inverse_transform(self, X):
# self.components_) + self.mean_
# else:
Y = np.dot(X, self.loadings_)
if hasattr(self, 'center_'):
Y = Y + self.center_
m = self.centerer_.mean_
if m is not None:
Y = Y + m
return Y

def score_samples(self, X):
Expand Down Expand Up @@ -341,6 +332,7 @@ def score_samples_log_lik(self, X):
Log-likelihood of each sample under the current model.
"""
raise NotImplementedError
# TODO: implement this!
check_is_fitted(self, 'loadings_')

X = check_array(X)
Expand Down
21 changes: 0 additions & 21 deletions ya_pca/rank_selection/base.py

This file was deleted.

2 changes: 1 addition & 1 deletion ya_pca/rank_selection/bi_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def select_rank_bi_cv(X, max_rank=None, krow=2, kcol=2,
random_state=None, rotate=False):
"""
Estimates the PCA rank using the bi-cross validation method discussed in
(Owen and Perry, 2009)
(Owen and Perry, 2009).
Parameters
----------
Expand Down
4 changes: 1 addition & 3 deletions ya_pca/rank_selection/marcenko_pastur.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from scipy.optimize import root_scalar
import numpy as np

# TODO: document beta


def get_mp_pdf(beta):
"""
Expand All @@ -12,7 +10,7 @@ def get_mp_pdf(beta):
Parameters
----------
beta: float
TODO
TODO: document
Output
------
Expand Down
3 changes: 1 addition & 2 deletions ya_pca/rank_selection/minka.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def select_rank_minka(shape, svals):
"""
Selects the PCA rank using the method from (Minka, 2000)
Selects the PCA rank using the method from (Minka, 2000).
Parameters
----------
Expand All @@ -18,7 +18,6 @@ def select_rank_minka(shape, svals):
svals: array-like, (n_features, )
All singular values of the data matrix X.
Output
------
rank_est, out
Expand Down
1 change: 1 addition & 0 deletions ya_pca/rank_selection/noise_estimates.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def estimate_noise_shabalin_nobel(X, UDV=None):
Estimates the noise level using the method presented in Section 4.1
of (Shablin and Nobel, 2010)
"""
# TODO: implement this!
raise NotImplementedError


Expand Down
4 changes: 1 addition & 3 deletions ya_pca/rank_selection/rmt_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ def select_rank_rmt_threshold(X, thresh_method='dg',
noise_est='mp',
noise_est_kwargs={},
UDV=None):

"""
Selects the PCA rank using one of the random matrix theory based thresholds.
Expand Down Expand Up @@ -72,7 +71,7 @@ def select_rank_rmt_threshold(X, thresh_method='dg',

def marcenko_pastur_edge_threshold(shape, sigma):
"""
See (Gavish and Donoho, 2014)
See (Gavish and Donoho, 2014).
Parameters
----------
Expand All @@ -82,7 +81,6 @@ def marcenko_pastur_edge_threshold(shape, sigma):
sigma: float
Estiamte of the noise standard deviation.
Output
------
sigular_value_threshold: float
Expand Down
8 changes: 6 additions & 2 deletions ya_pca/rank_selection/soft_impute_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from sklearn.model_selection import KFold
from joblib import Parallel, delayed

from fancyimpute import SoftImpute
# from fancyimpute import SoftImpute


def soft_impute_cv(X, K=5, n_lambd_vals=100,
max_iters=100, convergence_threshold=0.001,
max_rank=None, verbose=False,
n_jobs=None, backend=None):
"""
Performs Wold style cross-validation to select the lambda value for the nuclear norm penalized version of PCA. See (Choi et al, 2017)
Performs Wold style cross-validation to select the lambda value for the nuclear norm penalized version of PCA. See (Choi et al, 2017).
Parameters
----------
Expand Down Expand Up @@ -42,6 +42,10 @@ def soft_impute_cv(X, K=5, n_lambd_vals=100,
Detailed cross-validation output.
"""

# only import from fancyimpute if we actually need it
# TODO-HACK: figure out better way to deal with this
from fancyimpute import SoftImpute

assert X.ndim == 2
lambd_max = np.linalg.norm(X, ord=2) # largest sval
lambd_vals = np.linspace(start=0, stop=lambd_max, num=n_lambd_vals)
Expand Down
4 changes: 1 addition & 3 deletions ya_pca/rank_selection/wold_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
import pandas as pd
import matplotlib.pyplot as plt


from ya_pca.svd_missing import svd_missing
from ya_pca.linalg_utils import rand_orthog


def select_rank_wold_cv(X, max_rank, p_holdout=0.3, opt_kws={}, n_folds=5,
random_state=None, rotate=False):
"""
Estimates the PCA rank using the Wold style cross- validation method discussed in (Owen and Perry, 2009)
Estimates the PCA rank using the Wold style cross- validation method discussed in (Owen and Perry, 2009).
Parameters
----------
Expand All @@ -30,7 +29,6 @@ def select_rank_wold_cv(X, max_rank, p_holdout=0.3, opt_kws={}, n_folds=5,
n_folds: int
Number of cross-validation folds.
random_state: None, int
Random seed.
Expand Down
2 changes: 0 additions & 2 deletions ya_pca/svd_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,3 @@ def censored_lstsq(A, B, M):
r = T.shape[1]
T[:, np.arange(r), np.arange(r)] += 1e-6
return np.squeeze(np.linalg.solve(T, rhs), axis=-1).T


61 changes: 48 additions & 13 deletions ya_pca/toy_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,16 @@


def rand_factor_model(n_samples=100, n_features=20,
rank=10,
m=1.5,
sigma=1.0, random_state=None):
rank=5, m=1.5, noise_std=1.0, random_state=None):
"""
X = sqrt(n) UV^T + E
U = scores (n_samples x rank) has iid normal entries
V = loadings (n_features x rank) is a column orthonormal matrix
E (n_samples x n_features) has iid N(0, sigma^2) entries
Samples from the matrix factorization model
X = U diag(svals) V^T + E
where U,V are orthonormal matrices. The svals are linearly increasing following the Choi et al. 2017 section 2.2.3. The noise matrix has iid N(0, noise_std^2) entries.
Follows Choi et al. 2017 section 2.2.3.
Parameters
----------
n_samples: int
Expand All @@ -34,7 +33,7 @@ def rand_factor_model(n_samples=100, n_features=20,
m: float
Signal strength.
sigma: float
noise_std: float
Standard deviation of the noise.
random_state: None, int
Expand All @@ -57,10 +56,13 @@ def rand_factor_model(n_samples=100, n_features=20,
U = rand_orthog(n_samples, rank, random_state=rng)
V = rand_orthog(n_features, rank, random_state=rng)

svals = m * np.arange(1, rank + 1) * sigma * (n_samples * n_features) ** (.25)
svals = np.sort(svals)[::-1]
svals = lin_spaced_svals(n_samples=n_samples,
n_features=n_features,
rank=rank,
sigma_sq=noise_std ** 2,
m=m)

E = rng.normal(size=(n_samples, n_features), scale=sigma)
E = rng.normal(size=(n_samples, n_features), scale=noise_std)

X = (U * svals).dot(V.T) + E

Expand All @@ -71,10 +73,43 @@ def rand_factor_model(n_samples=100, n_features=20,
'E': E}


def lin_spaced_svals(n_samples, n_features, rank, sigma_sq=1, m=1.5):
"""
Linearly spaced singular values from equation (2.15) of (Choi et al, 2017).
Parameters
----------
n_samples: int
Number of samples.
n_features: int
Number of features.
rank: int
The rank of the low rank signal matrix.
sigma_sq: float
Noise level.
m: float
Signal strength. For m<1 we do not expect to be able to find the signal.
Output
------
svals: list of floats, (rank, )
The singular values in decreasing order.
"""

sval_base = m * np.sqrt(sigma_sq) * (n_samples * n_features) ** (.25)
svals = np.arange(1, rank + 1) * sval_base
svals = np.sort(svals)[::-1]
return svals


def perry_sim_dist(strong=True, sparse=False, noise='white',
random_state=None):
"""
Toy PCA data example from Section 5.4 of (Perry, 2009)
Toy PCA data example from simulations in Section 5.4 of (Perry, 2009).
Parameters
----------
Expand Down

0 comments on commit 7a6f689

Please sign in to comment.