clean up

idc9 · Oct 15, 2020 · 7a6f689 · 7a6f689
1 parent 8f686e0
commit 7a6f689
Show file tree

Hide file tree

Showing 12 changed files with 75 additions and 72 deletions.
diff --git a/README.md b/README.md
@@ -60,10 +60,11 @@ Testing is done using nose.
 
 We welcome contributions to make this a stronger package: data examples, bug fixes, spelling errors, new features, etc.
 
-
+<!--
 # Citation
 
-You can use the below badg to generate a DOI and bibtex citation
+You can use the below badge to generate a DOI and bibtex citation
 
  [![DOI](https://zenodo.org/badge/TODO.svg)](https://zenodo.org/badge/latestdoi/TODO)
 
+-->
diff --git a/ya_pca/PCA.py b/ya_pca/PCA.py
@@ -6,6 +6,7 @@
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
 from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler
 
 from warnings import warn
 
@@ -114,16 +115,10 @@ def fit(self, X):
         #                         ensure_2d=True, copy=self.copy)
 
         self.metadata_ = {'shape': X.shape}
-        # if isinstance(X, pd.DataFrame):
-        #     sample_names = X.index.values
-        #     var_names = X.columns.values
-
-        # possibly center X
-        if self.center:
-            # assert X is not sparse
-            X = np.array(X)
-            self.center_ = X.mean(axis=0)
-            X = X - self.center_
+
+        self.centerer_ = StandardScaler(copy=True, with_mean=self.center,
+                                        with_std=False)
+        X = self.centerer_.fit_transform(X)
 
         # smallest rank the SVD can be
         m = min(X.shape)
@@ -169,7 +164,7 @@ def fit(self, X):
 
             start_time = time()
             UDV, rank_est, out = select_rank(X=X, method=self.n_components,
-                                              UDV=UDV, **self.rank_sel_kws)
+                                             UDV=UDV, **self.rank_sel_kws)
 
             self.metadata_['rank_sel_runtime'] = time() - start_time
 
@@ -185,7 +180,7 @@ def fit(self, X):
                      " to look at diagnostics for the rank selection method."
                      .format(rank_est))
 
-        # TODO: possibly get noise estimate
+        # TODO: get noise estimate
 
         return self
 
@@ -267,16 +262,11 @@ def transform(self, X):
         -------
         X_new : array-like, shape (n_samples, n_components)
 
-        Examples
-        --------
-        TODO
         """
         check_is_fitted(self, attributes='loadings_')
 
         X = check_array(X)
-        if hasattr(self, 'center_'):
-            X = X - self.center_
-
+        X = self.centerer_.transform(X)
         projections = np.dot(X, self.loadings_)
         # if self.whiten:
         #     X_transformed /= np.sqrt(self.explained_variance_)
@@ -301,8 +291,9 @@ def inverse_transform(self, X):
         #                     self.components_) + self.mean_
         # else:
         Y = np.dot(X, self.loadings_)
-        if hasattr(self, 'center_'):
-            Y = Y + self.center_
+        m = self.centerer_.mean_
+        if m is not None:
+            Y = Y + m
         return Y
 
     def score_samples(self, X):
@@ -341,6 +332,7 @@ def score_samples_log_lik(self, X):
             Log-likelihood of each sample under the current model.
         """
         raise NotImplementedError
+        # TODO: implement this!
         check_is_fitted(self, 'loadings_')
 
         X = check_array(X)

diff --git a/ya_pca/rank_selection/base.py b/ya_pca/rank_selection/base.py
diff --git a/ya_pca/rank_selection/bi_cv.py b/ya_pca/rank_selection/bi_cv.py
@@ -12,7 +12,7 @@ def select_rank_bi_cv(X, max_rank=None, krow=2, kcol=2,
                       random_state=None, rotate=False):
     """
     Estimates the PCA rank using the bi-cross validation method discussed in
-    (Owen and Perry, 2009)
+    (Owen and Perry, 2009).
 
     Parameters
     ----------

diff --git a/ya_pca/rank_selection/marcenko_pastur.py b/ya_pca/rank_selection/marcenko_pastur.py
@@ -2,8 +2,6 @@
 from scipy.optimize import root_scalar
 import numpy as np
 
-# TODO: document beta
-
 
 def get_mp_pdf(beta):
     """
@@ -12,7 +10,7 @@ def get_mp_pdf(beta):
     Parameters
     ----------
     beta: float
-        TODO
+        TODO: document
 
     Output
     ------

diff --git a/ya_pca/rank_selection/minka.py b/ya_pca/rank_selection/minka.py
@@ -9,7 +9,7 @@
 
 def select_rank_minka(shape, svals):
     """
-    Selects the PCA rank using the method from (Minka, 2000)
+    Selects the PCA rank using the method from (Minka, 2000).
 
     Parameters
     ----------
@@ -18,7 +18,6 @@ def select_rank_minka(shape, svals):
     svals: array-like, (n_features, )
         All singular values of the data matrix X.
 
-
     Output
     ------
     rank_est, out

diff --git a/ya_pca/rank_selection/noise_estimates.py b/ya_pca/rank_selection/noise_estimates.py
@@ -193,6 +193,7 @@ def estimate_noise_shabalin_nobel(X, UDV=None):
     Estimates the noise level using the method presented in Section 4.1
     of (Shablin and Nobel, 2010)
     """
+    # TODO: implement this!
     raise NotImplementedError
 
 

diff --git a/ya_pca/rank_selection/rmt_threshold.py b/ya_pca/rank_selection/rmt_threshold.py
@@ -9,7 +9,6 @@ def select_rank_rmt_threshold(X, thresh_method='dg',
                               noise_est='mp',
                               noise_est_kwargs={},
                               UDV=None):
-
     """
     Selects the PCA rank using one of the random matrix theory based thresholds.
 
@@ -72,7 +71,7 @@ def select_rank_rmt_threshold(X, thresh_method='dg',
 
 def marcenko_pastur_edge_threshold(shape, sigma):
     """
-    See (Gavish and Donoho, 2014)
+    See (Gavish and Donoho, 2014).
 
     Parameters
     ----------
@@ -82,7 +81,6 @@ def marcenko_pastur_edge_threshold(shape, sigma):
     sigma: float
         Estiamte of the noise standard deviation.
 
-
     Output
     ------
     sigular_value_threshold: float

diff --git a/ya_pca/rank_selection/soft_impute_cv.py b/ya_pca/rank_selection/soft_impute_cv.py
@@ -3,15 +3,15 @@
 from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 
-from fancyimpute import SoftImpute
+# from fancyimpute import SoftImpute
 
 
 def soft_impute_cv(X, K=5, n_lambd_vals=100,
                    max_iters=100, convergence_threshold=0.001,
                    max_rank=None, verbose=False,
                    n_jobs=None, backend=None):
     """
-    Performs Wold style cross-validation to select the lambda value for the nuclear norm penalized version of PCA. See (Choi et al, 2017)
+    Performs Wold style cross-validation to select the lambda value for the nuclear norm penalized version of PCA. See (Choi et al, 2017).
 
     Parameters
     ----------
@@ -42,6 +42,10 @@ def soft_impute_cv(X, K=5, n_lambd_vals=100,
         Detailed cross-validation output.
     """
 
+    # only import from fancyimpute if we actually need it
+    # TODO-HACK: figure out better way to deal with this
+    from fancyimpute import SoftImpute
+
     assert X.ndim == 2
     lambd_max = np.linalg.norm(X, ord=2)  # largest sval
     lambd_vals = np.linspace(start=0, stop=lambd_max, num=n_lambd_vals)

diff --git a/ya_pca/rank_selection/wold_cv.py b/ya_pca/rank_selection/wold_cv.py
@@ -3,15 +3,14 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 
-
 from ya_pca.svd_missing import svd_missing
 from ya_pca.linalg_utils import rand_orthog
 
 
 def select_rank_wold_cv(X, max_rank, p_holdout=0.3, opt_kws={}, n_folds=5,
                         random_state=None, rotate=False):
     """
-    Estimates the PCA rank using the Wold style cross- validation method discussed in (Owen and Perry, 2009)
+    Estimates the PCA rank using the Wold style cross- validation method discussed in (Owen and Perry, 2009).
 
     Parameters
     ----------
@@ -30,7 +29,6 @@ def select_rank_wold_cv(X, max_rank, p_holdout=0.3, opt_kws={}, n_folds=5,
     n_folds: int
         Number of cross-validation folds.
 
-
     random_state: None, int
         Random seed.
 

diff --git a/ya_pca/svd_missing.py b/ya_pca/svd_missing.py
@@ -133,5 +133,3 @@ def censored_lstsq(A, B, M):
         r = T.shape[1]
         T[:, np.arange(r), np.arange(r)] += 1e-6
         return np.squeeze(np.linalg.solve(T, rhs), axis=-1).T
-
-
diff --git a/ya_pca/toy_data.py b/ya_pca/toy_data.py
@@ -9,17 +9,16 @@
 
 
 def rand_factor_model(n_samples=100, n_features=20,
-                      rank=10,
-                      m=1.5,
-                      sigma=1.0, random_state=None):
+                      rank=5, m=1.5, noise_std=1.0, random_state=None):
     """
 
-    X = sqrt(n) UV^T + E
-    U = scores (n_samples x rank) has iid normal entries
-    V = loadings (n_features x rank) is a column orthonormal matrix
-    E (n_samples x n_features) has iid N(0, sigma^2) entries
+    Samples from the matrix factorization model
+
+    X = U diag(svals) V^T + E
+
+    where U,V are orthonormal matrices. The svals are linearly increasing following the Choi et al. 2017 section 2.2.3. The noise matrix has iid N(0, noise_std^2) entries.
+
 
-    Follows Choi et al. 2017 section 2.2.3.
     Parameters
     ----------
     n_samples: int
@@ -34,7 +33,7 @@ def rand_factor_model(n_samples=100, n_features=20,
     m: float
         Signal strength.
 
-    sigma: float
+    noise_std: float
         Standard deviation of the noise.
 
     random_state: None, int
@@ -57,10 +56,13 @@ def rand_factor_model(n_samples=100, n_features=20,
     U = rand_orthog(n_samples, rank, random_state=rng)
     V = rand_orthog(n_features, rank, random_state=rng)
 
-    svals = m * np.arange(1, rank + 1) * sigma * (n_samples * n_features) ** (.25)
-    svals = np.sort(svals)[::-1]
+    svals = lin_spaced_svals(n_samples=n_samples,
+                             n_features=n_features,
+                             rank=rank,
+                             sigma_sq=noise_std ** 2,
+                             m=m)
 
-    E = rng.normal(size=(n_samples, n_features), scale=sigma)
+    E = rng.normal(size=(n_samples, n_features), scale=noise_std)
 
     X = (U * svals).dot(V.T) + E
 
@@ -71,10 +73,43 @@ def rand_factor_model(n_samples=100, n_features=20,
                'E': E}
 
 
+def lin_spaced_svals(n_samples, n_features, rank, sigma_sq=1, m=1.5):
+    """
+    Linearly spaced singular values from equation (2.15) of (Choi et al, 2017).
+
+    Parameters
+    ----------
+    n_samples: int
+        Number of samples.
+
+    n_features: int
+        Number of features.
+
+    rank: int
+        The rank of the low rank signal matrix.
+
+    sigma_sq: float
+        Noise level.
+
+    m: float
+        Signal strength. For m<1 we do not expect to be able to find the signal.
+
+    Output
+    ------
+    svals: list of floats, (rank, )
+        The singular values in decreasing order.
+    """
+
+    sval_base = m * np.sqrt(sigma_sq) * (n_samples * n_features) ** (.25)
+    svals = np.arange(1, rank + 1) * sval_base
+    svals = np.sort(svals)[::-1]
+    return svals
+
+
 def perry_sim_dist(strong=True, sparse=False, noise='white',
                    random_state=None):
     """
-    Toy PCA data example from Section 5.4 of (Perry, 2009)
+    Toy PCA data example from simulations in Section 5.4 of (Perry, 2009).
 
     Parameters
     ----------
Original file line number	Diff line number	Diff line change
Expand Up		@@ -133,5 +133,3 @@ def censored_lstsq(A, B, M):
		r = T.shape[1]
		T[:, np.arange(r), np.arange(r)] += 1e-6
		return np.squeeze(np.linalg.solve(T, rhs), axis=-1).T