From 3a905f1fdd7e4ea5d1c85f77bac1e30bd2ac716d Mon Sep 17 00:00:00 2001
From: Armaan Abraham <armaanabraham@g.ucla.edu>
Date: Fri, 2 Feb 2024 16:35:16 -0800
Subject: [PATCH] fix tests

---
 ddmc/clustering.py              | 95 +++++++++++++++++----------------
 ddmc/datasets.py                | 39 ++++++++++++--
 ddmc/figures/figureM2.py        |  2 +-
 ddmc/pam250.py                  | 17 ++----
 ddmc/tests/test_CoClustering.py | 38 -------------
 ddmc/tests/test_DifClusters.py  | 30 -----------
 ddmc/tests/test_cluster.py      | 64 ++++++++++++++++++++++
 7 files changed, 153 insertions(+), 132 deletions(-)
 delete mode 100644 ddmc/tests/test_CoClustering.py
 delete mode 100644 ddmc/tests/test_DifClusters.py
 create mode 100644 ddmc/tests/test_cluster.py

diff --git a/ddmc/clustering.py b/ddmc/clustering.py
index 2d2770c3..388ee742 100644
--- a/ddmc/clustering.py
+++ b/ddmc/clustering.py
@@ -1,6 +1,6 @@
 """ Clustering functions. """
 
-from typing import Literal, List, Dict
+from typing import Literal, List, Sequence, Tuple
 import warnings
 from copy import deepcopy
 import itertools
@@ -39,7 +39,6 @@ def __init__(
         self.seq_weight = seq_weight
 
     def gen_peptide_distances(self, sequences: np.ndarray, distance_method):
-        # store parameters for sklearn's checks
         if sequences.dtype != str:
             sequences = sequences.astype("str")
         sequences = np.char.upper(sequences)
@@ -119,31 +118,6 @@ def fit(self, p_signal: pd.DataFrame):
         assert np.all(np.isfinite(self.seq_scores_))
         return self
 
-    def wins(self, X):
-        """Find similarity of fitted model to data and sequence models"""
-        check_is_fitted(self, ["scores_", "seq_scores_"])
-
-        alt_model = deepcopy(self)
-        alt_model.seq_weight = 0.0  # No influence
-        alt_model.fit(X)
-        data_model = alt_model.scores_
-
-        alt_model.seq_weight = 1000.0  # Overwhelming influence
-        alt_model.fit(X)
-        seq_model = alt_model.scores_
-
-        dataDist = np.linalg.norm(self.scores_ - data_model)
-        seqDist = np.linalg.norm(self.scores_ - seq_model)
-
-        for i in itertools.permutations(np.arange(self.n_components)):
-            dataDistTemp = np.linalg.norm(self.scores_ - data_model[:, i])
-            seqDistTemp = np.linalg.norm(self.scores_ - seq_model[:, i])
-
-            dataDist = np.minimum(dataDist, dataDistTemp)
-            seqDist = np.minimum(seqDist, seqDistTemp)
-
-        return (dataDist, seqDist)
-
     def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
         """
         Return cluster centers.
@@ -164,23 +138,38 @@ def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
             )
         return centers
 
-    def impute(self, X: np.ndarray) -> np.ndarray:
-        """Impute a matching dataset."""
-        X = X.copy()
+    def impute(self) -> pd.DataFrame:
+        """
+        Imputes missing values in the dataset passed in fit() and returns the
+        imputed dataset.
+        """
+        p_signal = self.p_signal.copy()
         labels = self.labels()  # cluster assignments
         centers = self.transform()  # samples x clusters
+        for ii in range(p_signal.shape[0]): 
+            p_signal[ii, np.isnan(p_signal[ii, :])] = centers[
+                np.isnan(p_signal[ii, :]), labels[ii] - 1
+            ]
+        assert np.all(np.isfinite(p_signal))
+        return p_signal
 
-        assert len(labels) == X.shape[0]
-        for ii in range(X.shape[0]):  # X is peptides x samples
-            X[ii, np.isnan(X[ii, :])] = centers[np.isnan(X[ii, :]), labels[ii] - 1]
-
-        assert np.all(np.isfinite(X))
-        return X
-
-    def get_pssms(self, PsP_background=False, clusters: List = None):
-        """Compute position-specific scoring matrix of each cluster.
+    def get_pssms(
+        self, PsP_background=False, clusters: List[int] = None
+    ) -> Tuple[np.ndarray, np.ndarray] | np.ndarray:
+        """
+        Compute position-specific scoring matrix of each cluster.
         Note, to normalize by amino acid frequency this uses either
         all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
+
+        Args:
+            PsP_background: Whether or not PhosphoSitePlus should be used for background frequency.
+            clusters: cluster indices to get pssms for
+
+        Returns:
+            If the clusters argument is used, an array of shape (len(clusters), 20, 11),
+            else two arrays, where the first (of shape (n_pssms,))
+            contains the clusters of the pssms in the second
+            (of shape (n_pssms, 20, 11)).
         """
         pssm_names, pssms = [], []
         if PsP_background:
@@ -250,12 +239,11 @@ def get_pssms(self, PsP_background=False, clusters: List = None):
     def predict_upstream_kinases(
         self,
         PsP_background=True,
-    ):
+    ) -> np.ndarray:
         """Compute matrix-matrix similarity between kinase specificity profiles
         and cluster PSSMs to identify upstream kinases regulating clusters."""
         kinases, pspls = get_pspls()
         clusters, pssms = self.get_pssms(PsP_background=PsP_background)
-
         distances = get_pspl_pssm_distances(
             pspls,
             pssms,
@@ -263,19 +251,21 @@ def predict_upstream_kinases(
             pssm_names=clusters,
             kinases=kinases,
         )
-
         return distances
 
-    def has_empty_clusters(self):
+    def has_empty_clusters(self) -> bool:
+        """
+        Checks whether the most recent call to fit() resulted in empty clusters.
+        """
         check_is_fitted(self, ["scores_"])
         return np.unique(self.labels()).size != self.n_components
 
-    def predict(self) -> np.ndarray:
+    def predict(self) -> np.ndarray[int]:
         """Provided the current model parameters, predict the cluster each peptide belongs to."""
         check_is_fitted(self, ["scores_"])
         return np.argmax(self.scores_, axis=1)
 
-    def labels(self) -> np.ndarray:
+    def labels(self) -> np.ndarray[int]:
         """Find cluster assignment with highest likelihood for each peptide."""
         return self.predict()
 
@@ -286,12 +276,25 @@ def score(self) -> float:
 
 
 def get_pspl_pssm_distances(
-    pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None
+    pspls: np.ndarray,
+    pssms: np.ndarray,
+    as_df=False,
+    pssm_names: Sequence[str] = None,
+    kinases: Sequence[str] = None,
 ) -> np.ndarray | pd.DataFrame:
     """
+    Computes a distance matrix between PSPLs and PSSMs.
+
     Args:
         pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
         pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
+        as_df: Whether or not the returned matrix should be returned as a
+            dataframe. Requires pssm_names and kinases.
+        pssm_names: list of names for the pssms of shape (n_pssms,)
+        kinases: list of names for the pspls of shape (n_kinase,)
+
+    Returns:
+        Distance matrix of shape (n_kinase, n_pssms).
     """
     assert pssms.shape[1:3] == (20, 11)
     assert pspls.shape[1:3] == (20, 9)
diff --git a/ddmc/datasets.py b/ddmc/datasets.py
index 92df38d8..9284de64 100644
--- a/ddmc/datasets.py
+++ b/ddmc/datasets.py
@@ -16,15 +16,31 @@ def filter_incomplete_peptides(
     min_experiments: int = None,
     sample_to_experiment: np.ndarray = None,
 ):
+    """
+    Filters out missing values from p-signal array.
+
+    Args:
+        sample_presence_ratio: the minimum fraction of non-missing values
+            allowed for a peptide before it is DROPPED.
+        min_experiments: the minimum number of experiments allowed for a peptide
+            before it is DROPPED. Must also pass in sample_to_experiment.
+        sample_to_experiment: array of shape `len(p_signal.columns)` that maps
+            each sample to an experiment (any identifier).
+
+    Returns:
+        Filtered data.
+    """
     # assume that X has sequences as the index and samples as columns
     if sample_presence_ratio is not None:
         peptide_idx = (
             np.count_nonzero(~np.isnan(p_signal), axis=1) / p_signal.shape[1]
             >= sample_presence_ratio
         )
-    else:
+    elif min_experiments is not None:
         assert min_experiments is not None
         assert sample_to_experiment is not None
+        # this is kind of confusing because of the use of numpy, but we're
+        # removing rows that have less than the minimum number of experiments
         unique_experiments = np.unique(sample_to_experiment)
         experiments_grid, s_to_e_grid = np.meshgrid(
             unique_experiments, sample_to_experiment, indexing="ij"
@@ -34,12 +50,19 @@ def filter_incomplete_peptides(
         peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum(
             axis=0
         ) >= min_experiments
+    else:
+        raise ValueError(
+            "Must specify either a sample presence or n_experiments threshold"
+        )
     return p_signal.iloc[peptide_idx]
 
 
 def select_peptide_subset(
     p_signal: pd.DataFrame, keep_ratio: float = None, keep_num: int = None
 ):
+    """
+    Selects a random subset of peptides from p_signal.
+    """
     if keep_ratio is not None:
         keep_num = int(p_signal.shape[0] * keep_ratio)
     return p_signal.iloc[np.random.choice(p_signal.shape[0], keep_num)]
@@ -66,7 +89,10 @@ def get_p_signal(self) -> pd.DataFrame:
             sample_to_experiment=self.get_sample_to_experiment(),
         )
 
-    def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
+    def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]) -> np.ndarray:
+        """
+        Get patients that have both NAT and tumor samples.
+        """
         samples = samples.astype(str)
         samples = samples[np.char.find(samples, "IR") == -1]
         tumor_samples = np.sort(samples[~np.char.endswith(samples, ".N")])
@@ -75,7 +101,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
         nat_patients = np.char.replace(nat_samples, ".N", "")
         return np.intersect1d(tumor_patients, nat_patients)
 
-    def get_mutations(self, mutation_names: Sequence[str] = None):
+    def get_mutations(self, mutation_names: Sequence[str] = None) -> pd.DataFrame:
         mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv")
         mutations = mutations.set_index("Sample.ID")
         patients = self.get_patients_with_nat_and_tumor(mutations.index.values)
@@ -84,7 +110,7 @@ def get_mutations(self, mutation_names: Sequence[str] = None):
             mutations = mutations[mutation_names]
         return mutations.astype(bool)
 
-    def get_hot_cold_labels(self):
+    def get_hot_cold_labels(self) -> pd.Series:
         hot_cold = (
             pd.read_csv(self.data_dir / "Hot_Cold.csv")
             .dropna(axis=1)
@@ -99,13 +125,16 @@ def get_hot_cold_labels(self):
         return np.squeeze(hot_cold).astype(bool)
 
     def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
+        """
+        Get tumor vs NAT for each of samples. Returned array contains True if
+        tumor.
+        """
         return ~np.array([sample.endswith(".N") for sample in samples])
 
 
 # MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)
 class EBDT:
     def get_p_signal(self) -> pd.DataFrame:
-        """Preprocess"""
         p_signal = (
             pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv")
             .drop("FDR", axis=1)
diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py
index 9f9d58b4..6824caa9 100644
--- a/ddmc/figures/figureM2.py
+++ b/ddmc/figures/figureM2.py
@@ -196,5 +196,5 @@ def impute_ddmc(p_signal, n_clusters, weight, distance_method):
     return (
         DDMC(n_clusters, weight, distance_method, max_iter=1)
         .fit(p_signal)
-        .impute(p_signal)
+        .impute()
     )
diff --git a/ddmc/pam250.py b/ddmc/pam250.py
index 309a6285..607172ec 100644
--- a/ddmc/pam250.py
+++ b/ddmc/pam250.py
@@ -7,7 +7,7 @@
 class PAM250:
     def __init__(self, seqs: list[str]):
         # Compute all pairwise distances
-        self.background = MotifPam250Scores(seqs)
+        self.background = get_pam250_scores(seqs)
         self.logWeights = 0.0
 
     def from_summaries(self, weightsIn: np.ndarray):
@@ -17,7 +17,7 @@ def from_summaries(self, weightsIn: np.ndarray):
         self.logWeights = (self.background @ weightsIn) / sums
 
 
-def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
+def get_pam250_scores(seqs: list[str]) -> np.ndarray:
     """Calculate and store all pairwise pam250 distances before starting."""
     pam250 = substitution_matrices.load("PAM250")
     seqs = np.array(
@@ -27,17 +27,10 @@ def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
     # convert to np array
     pam250m = np.array(pam250.values(), dtype=np.int8).reshape(pam250.shape)
 
-    out = distanceCalc(seqs, pam250m)
-
-    i_upper = np.triu_indices_from(out, k=1)
-    out[i_upper] = out.T[i_upper]  # pylint: disable=unsubscriptable-object
-    return out
-
-
-def distanceCalc(seqs, pam250m: np.ndarray):
-    """Calculate all the pairwise distances."""
-    # WARNING this type can only hold -128 to 127
     out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8)
     i_idx, j_idx = np.tril_indices(seqs.shape[0])
     out[i_idx, j_idx] = np.sum(pam250m[seqs[i_idx], seqs[j_idx]], axis=1)
+
+    i_upper = np.triu_indices_from(out, k=1)
+    out[i_upper] = out.T[i_upper]  # pylint: disable=unsubscriptable-object
     return out
diff --git a/ddmc/tests/test_CoClustering.py b/ddmc/tests/test_CoClustering.py
deleted file mode 100644
index eaa56d6f..00000000
--- a/ddmc/tests/test_CoClustering.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""
-Testing file for the clustering methods by data and sequence.
-"""
-
-import pytest
-import numpy as np
-import pandas as pd
-from ..clustering import DDMC
-from ..pre_processing import filter_NaNpeptides
-
-
-X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:]
-X = filter_NaNpeptides(X, tmt=25)
-data = X.select_dtypes(include=["float64"]).T
-seqs = X["Sequence"]
-
-
-@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"])
-def test_wins(distance_method):
-    """Test that EMclustering is working by comparing with GMM clusters."""
-    MSC = DDMC(seqs, 2, seq_weight=0, distance_method=distance_method).fit(X=data)
-    distances = MSC.wins(data)
-
-    # assert that the distance to the same sequence weight is less
-    assert distances[0] < 1.0
-    assert distances[0] < distances[1]
-
-
-@pytest.mark.parametrize("w", [0, 0.1, 10.0])
-@pytest.mark.parametrize("ncl", [2, 5, 25])
-@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"])
-def test_clusters(w, ncl, distance_method):
-    """Test that EMclustering is working by comparing with GMM clusters."""
-    MSC = DDMC(seqs, ncl, seq_weight=w, distance_method=distance_method).fit(X=data)
-
-    # Assert that we got a reasonable result
-    assert np.all(np.isfinite(MSC.scores_))
-    assert np.all(np.isfinite(MSC.seq_scores_))
diff --git a/ddmc/tests/test_DifClusters.py b/ddmc/tests/test_DifClusters.py
deleted file mode 100644
index 69b4515d..00000000
--- a/ddmc/tests/test_DifClusters.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""
-Testing file for minimum variance across clusters using CPTAC data.
-"""
-
-import pytest
-import numpy as np
-import pandas as pd
-from scipy.spatial.distance import cdist
-from ..clustering import DDMC
-from ..pre_processing import filter_NaNpeptides
-
-X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:]
-X = filter_NaNpeptides(X, tmt=25)
-d = X.select_dtypes(include=["float64"]).T
-
-
-@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"])
-def test_ClusterVar(distance_method):
-    """Test minimum variance of output cluster centers"""
-    model = DDMC(X["Sequence"], 6, seq_weight=3, distance_method=distance_method).fit(
-        X=d
-    )
-    centers = model.transform()
-
-    # Get pairwise cluster distances
-    dists = cdist(centers.T, centers.T)
-    np.fill_diagonal(dists, 10.0)
-
-    # Assert that all the clusters are at least euclidean distance 1 away
-    assert np.all(dists > 1.0)
diff --git a/ddmc/tests/test_cluster.py b/ddmc/tests/test_cluster.py
new file mode 100644
index 00000000..2f3e9aab
--- /dev/null
+++ b/ddmc/tests/test_cluster.py
@@ -0,0 +1,64 @@
+"""
+Testing file for the clustering methods by data and sequence.
+"""
+
+import pytest
+import numpy as np
+from scipy.spatial.distance import cdist
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.mixture import GaussianMixture
+
+from ddmc.clustering import DDMC
+from ddmc.datasets import CPTAC, filter_incomplete_peptides
+
+
+def test_wins():
+    """Test that EMclustering is working by comparing with GMM clusters."""
+    p_signal = filter_incomplete_peptides(
+        CPTAC().get_p_signal(), sample_presence_ratio=1
+    )
+    model_ddmc = DDMC(n_components=2, seq_weight=0).fit(p_signal)
+    model_gmm = GaussianMixture(n_components=2).fit(p_signal.values)
+
+    similarity = cosine_similarity(model_gmm.means_, model_ddmc.transform().T)
+
+    # only works for 2 clusters, check that the two clusters are matched up
+    # either index-matching or otherwise
+    diag = np.eye(2, dtype=bool)
+    offdiag = ~diag
+
+    assert np.all(similarity[diag] > 0.95) or np.all(similarity[offdiag] > 0.95)
+
+
+@pytest.mark.parametrize("w", [0, 0.1, 10.0])
+@pytest.mark.parametrize("ncl", [2, 5, 25])
+@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"])
+def test_clusters(w, ncl, distance_method):
+    p_signal = filter_incomplete_peptides(
+        CPTAC().get_p_signal(), sample_presence_ratio=1
+    )
+    model = DDMC(ncl, seq_weight=w, distance_method=distance_method).fit(p_signal)
+
+    # Assert that we got a reasonable result
+    assert np.all(np.isfinite(model.scores_))
+    assert np.all(np.isfinite(model.seq_scores_))
+
+
+@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"])
+def test_ClusterVar(distance_method):
+    """Test minimum variance of output cluster centers"""
+    p_signal = filter_incomplete_peptides(
+        CPTAC().get_p_signal(), sample_presence_ratio=1
+    )
+
+    model = DDMC(n_components=6, seq_weight=3, distance_method=distance_method).fit(
+        p_signal
+    )
+    centers = model.transform()
+
+    # Get pairwise cluster distances
+    dists = cdist(centers.T, centers.T)
+    np.fill_diagonal(dists, 10.0)
+
+    # Assert that all the clusters are at least euclidean distance 1 away
+    assert np.all(dists > 1.0)