fix tests

meyer-lab · Feb 3, 2024 · 3a905f1 · 3a905f1
1 parent f6a0b34
commit 3a905f1
Show file tree

Hide file tree

Showing 7 changed files with 153 additions and 132 deletions.
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -1,6 +1,6 @@
 """ Clustering functions. """
 
-from typing import Literal, List, Dict
+from typing import Literal, List, Sequence, Tuple
 import warnings
 from copy import deepcopy
 import itertools
@@ -39,7 +39,6 @@ def __init__(
         self.seq_weight = seq_weight
 
     def gen_peptide_distances(self, sequences: np.ndarray, distance_method):
-        # store parameters for sklearn's checks
         if sequences.dtype != str:
             sequences = sequences.astype("str")
         sequences = np.char.upper(sequences)
@@ -119,31 +118,6 @@ def fit(self, p_signal: pd.DataFrame):
         assert np.all(np.isfinite(self.seq_scores_))
         return self
 
-    def wins(self, X):
-        """Find similarity of fitted model to data and sequence models"""
-        check_is_fitted(self, ["scores_", "seq_scores_"])
-
-        alt_model = deepcopy(self)
-        alt_model.seq_weight = 0.0  # No influence
-        alt_model.fit(X)
-        data_model = alt_model.scores_
-
-        alt_model.seq_weight = 1000.0  # Overwhelming influence
-        alt_model.fit(X)
-        seq_model = alt_model.scores_
-
-        dataDist = np.linalg.norm(self.scores_ - data_model)
-        seqDist = np.linalg.norm(self.scores_ - seq_model)
-
-        for i in itertools.permutations(np.arange(self.n_components)):
-            dataDistTemp = np.linalg.norm(self.scores_ - data_model[:, i])
-            seqDistTemp = np.linalg.norm(self.scores_ - seq_model[:, i])
-
-            dataDist = np.minimum(dataDist, dataDistTemp)
-            seqDist = np.minimum(seqDist, seqDistTemp)
-
-        return (dataDist, seqDist)
-
     def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
         """
         Return cluster centers.
@@ -164,23 +138,38 @@ def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
             )
         return centers
 
-    def impute(self, X: np.ndarray) -> np.ndarray:
-        """Impute a matching dataset."""
-        X = X.copy()
+    def impute(self) -> pd.DataFrame:
+        """
+        Imputes missing values in the dataset passed in fit() and returns the
+        imputed dataset.
+        """
+        p_signal = self.p_signal.copy()
         labels = self.labels()  # cluster assignments
         centers = self.transform()  # samples x clusters
+        for ii in range(p_signal.shape[0]): 
+            p_signal[ii, np.isnan(p_signal[ii, :])] = centers[
+                np.isnan(p_signal[ii, :]), labels[ii] - 1
+            ]
+        assert np.all(np.isfinite(p_signal))
+        return p_signal
 
-        assert len(labels) == X.shape[0]
-        for ii in range(X.shape[0]):  # X is peptides x samples
-            X[ii, np.isnan(X[ii, :])] = centers[np.isnan(X[ii, :]), labels[ii] - 1]
-
-        assert np.all(np.isfinite(X))
-        return X
-
-    def get_pssms(self, PsP_background=False, clusters: List = None):
-        """Compute position-specific scoring matrix of each cluster.
+    def get_pssms(
+        self, PsP_background=False, clusters: List[int] = None
+    ) -> Tuple[np.ndarray, np.ndarray] | np.ndarray:
+        """
+        Compute position-specific scoring matrix of each cluster.
         Note, to normalize by amino acid frequency this uses either
         all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
+
+        Args:
+            PsP_background: Whether or not PhosphoSitePlus should be used for background frequency.
+            clusters: cluster indices to get pssms for
+
+        Returns:
+            If the clusters argument is used, an array of shape (len(clusters), 20, 11),
+            else two arrays, where the first (of shape (n_pssms,))
+            contains the clusters of the pssms in the second
+            (of shape (n_pssms, 20, 11)).
         """
         pssm_names, pssms = [], []
         if PsP_background:
@@ -250,32 +239,33 @@ def get_pssms(self, PsP_background=False, clusters: List = None):
     def predict_upstream_kinases(
         self,
         PsP_background=True,
-    ):
+    ) -> np.ndarray:
         """Compute matrix-matrix similarity between kinase specificity profiles
         and cluster PSSMs to identify upstream kinases regulating clusters."""
         kinases, pspls = get_pspls()
         clusters, pssms = self.get_pssms(PsP_background=PsP_background)
-
         distances = get_pspl_pssm_distances(
             pspls,
             pssms,
             as_df=True,
             pssm_names=clusters,
             kinases=kinases,
         )
-
         return distances
 
-    def has_empty_clusters(self):
+    def has_empty_clusters(self) -> bool:
+        """
+        Checks whether the most recent call to fit() resulted in empty clusters.
+        """
         check_is_fitted(self, ["scores_"])
         return np.unique(self.labels()).size != self.n_components
 
-    def predict(self) -> np.ndarray:
+    def predict(self) -> np.ndarray[int]:
         """Provided the current model parameters, predict the cluster each peptide belongs to."""
         check_is_fitted(self, ["scores_"])
         return np.argmax(self.scores_, axis=1)
 
-    def labels(self) -> np.ndarray:
+    def labels(self) -> np.ndarray[int]:
         """Find cluster assignment with highest likelihood for each peptide."""
         return self.predict()
 
@@ -286,12 +276,25 @@ def score(self) -> float:
 
 
 def get_pspl_pssm_distances(
-    pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None
+    pspls: np.ndarray,
+    pssms: np.ndarray,
+    as_df=False,
+    pssm_names: Sequence[str] = None,
+    kinases: Sequence[str] = None,
 ) -> np.ndarray | pd.DataFrame:
     """
+    Computes a distance matrix between PSPLs and PSSMs.
+
     Args:
         pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
         pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
+        as_df: Whether the distance matrix should be returned as a
+            dataframe. Requires pssm_names and kinases.
+        pssm_names: names of the pssms, a sequence of length n_pssms
+        kinases: names of the kinases (pspls), a sequence of length n_kinase
+
+    Returns:
+        Distance matrix of shape (n_kinase, n_pssms).
     """
     assert pssms.shape[1:3] == (20, 11)
     assert pspls.shape[1:3] == (20, 9)

diff --git a/ddmc/datasets.py b/ddmc/datasets.py
@@ -16,15 +16,31 @@ def filter_incomplete_peptides(
     min_experiments: int = None,
     sample_to_experiment: np.ndarray = None,
 ):
+    """
+    Filters out peptides (rows) of p_signal that have too many missing values.
+
+    Args:
+        sample_presence_ratio: the minimum fraction of non-missing values a
+            peptide must have to be KEPT; peptides below it are dropped.
+        min_experiments: the minimum number of experiments a peptide must be
+            present in to be KEPT. Must also pass in sample_to_experiment.
+        sample_to_experiment: array of shape `len(p_signal.columns)` that maps
+            each sample to an experiment (any identifier).
+
+    Returns:
+        Filtered data.
+    """
     # assume that X has sequences as the index and samples as columns
     if sample_presence_ratio is not None:
         peptide_idx = (
             np.count_nonzero(~np.isnan(p_signal), axis=1) / p_signal.shape[1]
             >= sample_presence_ratio
         )
-    else:
+    elif min_experiments is not None:
         assert min_experiments is not None
         assert sample_to_experiment is not None
+        # the numpy broadcasting below is dense, but in effect we're
+        # removing rows present in fewer than min_experiments experiments
         unique_experiments = np.unique(sample_to_experiment)
         experiments_grid, s_to_e_grid = np.meshgrid(
             unique_experiments, sample_to_experiment, indexing="ij"
@@ -34,12 +50,19 @@ def filter_incomplete_peptides(
         peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum(
             axis=0
         ) >= min_experiments
+    else:
+        raise ValueError(
+            "Must specify either a sample presence or n_experiments threshold"
+        )
     return p_signal.iloc[peptide_idx]
 
 
 def select_peptide_subset(
     p_signal: pd.DataFrame, keep_ratio: float = None, keep_num: int = None
 ):
+    """
+    Randomly samples keep_num peptides from p_signal (with replacement).
+    """
     if keep_ratio is not None:
         keep_num = int(p_signal.shape[0] * keep_ratio)
     return p_signal.iloc[np.random.choice(p_signal.shape[0], keep_num)]
@@ -66,7 +89,10 @@ def get_p_signal(self) -> pd.DataFrame:
             sample_to_experiment=self.get_sample_to_experiment(),
         )
 
-    def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
+    def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]) -> np.ndarray:
+        """
+        Get patients that have both NAT and tumor samples.
+        """
         samples = samples.astype(str)
         samples = samples[np.char.find(samples, "IR") == -1]
         tumor_samples = np.sort(samples[~np.char.endswith(samples, ".N")])
@@ -75,7 +101,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
         nat_patients = np.char.replace(nat_samples, ".N", "")
         return np.intersect1d(tumor_patients, nat_patients)
 
-    def get_mutations(self, mutation_names: Sequence[str] = None):
+    def get_mutations(self, mutation_names: Sequence[str] = None) -> pd.DataFrame:
         mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv")
         mutations = mutations.set_index("Sample.ID")
         patients = self.get_patients_with_nat_and_tumor(mutations.index.values)
@@ -84,7 +110,7 @@ def get_mutations(self, mutation_names: Sequence[str] = None):
             mutations = mutations[mutation_names]
         return mutations.astype(bool)
 
-    def get_hot_cold_labels(self):
+    def get_hot_cold_labels(self) -> pd.Series:
         hot_cold = (
             pd.read_csv(self.data_dir / "Hot_Cold.csv")
             .dropna(axis=1)
@@ -99,13 +125,16 @@ def get_hot_cold_labels(self):
         return np.squeeze(hot_cold).astype(bool)
 
     def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
+        """
+        Classify each sample as tumor or NAT. Returned array is True where
+        the sample is a tumor, i.e. its name does not end with ".N".
+        """
         return ~np.array([sample.endswith(".N") for sample in samples])
 
 
 # MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)
 class EBDT:
     def get_p_signal(self) -> pd.DataFrame:
-        """Preprocess"""
         p_signal = (
             pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv")
             .drop("FDR", axis=1)

diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py
@@ -196,5 +196,5 @@ def impute_ddmc(p_signal, n_clusters, weight, distance_method):
     return (
         DDMC(n_clusters, weight, distance_method, max_iter=1)
         .fit(p_signal)
-        .impute(p_signal)
+        .impute()
     )
diff --git a/ddmc/pam250.py b/ddmc/pam250.py
@@ -7,7 +7,7 @@
 class PAM250:
     def __init__(self, seqs: list[str]):
         # Compute all pairwise distances
-        self.background = MotifPam250Scores(seqs)
+        self.background = get_pam250_scores(seqs)
         self.logWeights = 0.0
 
     def from_summaries(self, weightsIn: np.ndarray):
@@ -17,7 +17,7 @@ def from_summaries(self, weightsIn: np.ndarray):
         self.logWeights = (self.background @ weightsIn) / sums
 
 
-def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
+def get_pam250_scores(seqs: list[str]) -> np.ndarray:
     """Calculate and store all pairwise pam250 distances before starting."""
     pam250 = substitution_matrices.load("PAM250")
     seqs = np.array(
@@ -27,17 +27,10 @@ def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
     # convert to np array
     pam250m = np.array(pam250.values(), dtype=np.int8).reshape(pam250.shape)
 
-    out = distanceCalc(seqs, pam250m)
-
-    i_upper = np.triu_indices_from(out, k=1)
-    out[i_upper] = out.T[i_upper]  # pylint: disable=unsubscriptable-object
-    return out
-
-
-def distanceCalc(seqs, pam250m: np.ndarray):
-    """Calculate all the pairwise distances."""
-    # WARNING this type can only hold -128 to 127
     out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8)
     i_idx, j_idx = np.tril_indices(seqs.shape[0])
     out[i_idx, j_idx] = np.sum(pam250m[seqs[i_idx], seqs[j_idx]], axis=1)
+
+    i_upper = np.triu_indices_from(out, k=1)
+    out[i_upper] = out.T[i_upper]  # pylint: disable=unsubscriptable-object
     return out
diff --git a/ddmc/tests/test_CoClustering.py b/ddmc/tests/test_CoClustering.py
diff --git a/ddmc/tests/test_DifClusters.py b/ddmc/tests/test_DifClusters.py