From 3a905f1fdd7e4ea5d1c85f77bac1e30bd2ac716d Mon Sep 17 00:00:00 2001 From: Armaan Abraham Date: Fri, 2 Feb 2024 16:35:16 -0800 Subject: [PATCH] fix tests --- ddmc/clustering.py | 95 +++++++++++++++++---------------- ddmc/datasets.py | 39 ++++++++++++-- ddmc/figures/figureM2.py | 2 +- ddmc/pam250.py | 17 ++---- ddmc/tests/test_CoClustering.py | 38 ------------- ddmc/tests/test_DifClusters.py | 30 ----------- ddmc/tests/test_cluster.py | 64 ++++++++++++++++++++++ 7 files changed, 153 insertions(+), 132 deletions(-) delete mode 100644 ddmc/tests/test_CoClustering.py delete mode 100644 ddmc/tests/test_DifClusters.py create mode 100644 ddmc/tests/test_cluster.py diff --git a/ddmc/clustering.py b/ddmc/clustering.py index 2d2770c3..388ee742 100644 --- a/ddmc/clustering.py +++ b/ddmc/clustering.py @@ -1,6 +1,6 @@ """ Clustering functions. """ -from typing import Literal, List, Dict +from typing import Literal, List, Sequence, Tuple import warnings from copy import deepcopy import itertools @@ -39,7 +39,6 @@ def __init__( self.seq_weight = seq_weight def gen_peptide_distances(self, sequences: np.ndarray, distance_method): - # store parameters for sklearn's checks if sequences.dtype != str: sequences = sequences.astype("str") sequences = np.char.upper(sequences) @@ -119,31 +118,6 @@ def fit(self, p_signal: pd.DataFrame): assert np.all(np.isfinite(self.seq_scores_)) return self - def wins(self, X): - """Find similarity of fitted model to data and sequence models""" - check_is_fitted(self, ["scores_", "seq_scores_"]) - - alt_model = deepcopy(self) - alt_model.seq_weight = 0.0 # No influence - alt_model.fit(X) - data_model = alt_model.scores_ - - alt_model.seq_weight = 1000.0 # Overwhelming influence - alt_model.fit(X) - seq_model = alt_model.scores_ - - dataDist = np.linalg.norm(self.scores_ - data_model) - seqDist = np.linalg.norm(self.scores_ - seq_model) - - for i in itertools.permutations(np.arange(self.n_components)): - dataDistTemp = np.linalg.norm(self.scores_ - data_model[:, i]) - seqDistTemp = np.linalg.norm(self.scores_ - seq_model[:, i]) - - dataDist = np.minimum(dataDist, dataDistTemp) - seqDist = np.minimum(seqDist, seqDistTemp) - - return (dataDist, seqDist) - def transform(self, as_df=False) -> np.ndarray | pd.DataFrame: """ Return cluster centers. @@ -164,23 +138,38 @@ def transform(self, as_df=False) -> np.ndarray | pd.DataFrame: ) return centers - def impute(self, X: np.ndarray) -> np.ndarray: - """Impute a matching dataset.""" - X = X.copy() + def impute(self) -> pd.DataFrame: + """ + Imputes missing values in the dataset passed in fit() and returns the + imputed dataset. + """ + p_signal = self.p_signal.copy() labels = self.labels() # cluster assignments centers = self.transform() # samples x clusters + for ii in range(p_signal.shape[0]): + p_signal[ii, np.isnan(p_signal[ii, :])] = centers[ + np.isnan(p_signal[ii, :]), labels[ii] - 1 + ] + assert np.all(np.isfinite(p_signal)) + return p_signal - assert len(labels) == X.shape[0] - for ii in range(X.shape[0]): # X is peptides x samples - X[ii, np.isnan(X[ii, :])] = centers[np.isnan(X[ii, :]), labels[ii] - 1] - - assert np.all(np.isfinite(X)) - return X - - def get_pssms(self, PsP_background=False, clusters: List = None): - """Compute position-specific scoring matrix of each cluster. + def get_pssms( + self, PsP_background=False, clusters: List[int] = None + ) -> Tuple[np.ndarray, np.ndarray] | np.ndarray: + """ + Compute position-specific scoring matrix of each cluster. Note, to normalize by amino acid frequency this uses either all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus. + + Args: + PsP_background: Whether or not PhosphoSitePlus should be used for background frequency. + clusters: cluster indices to get pssms for + + Returns: + If the clusters argument is used, an array of shape (len(clusters), 20, 11), + else two arrays, where the first (of shape (n_pssms,)) + contains the clusters of the pssms in the second + (of shape (n_pssms, 20, 11)). """ pssm_names, pssms = [], [] if PsP_background: @@ -250,12 +239,11 @@ def get_pssms(self, PsP_background=False, clusters: List = None): def predict_upstream_kinases( self, PsP_background=True, - ): + ) -> np.ndarray: """Compute matrix-matrix similarity between kinase specificity profiles and cluster PSSMs to identify upstream kinases regulating clusters.""" kinases, pspls = get_pspls() clusters, pssms = self.get_pssms(PsP_background=PsP_background) - distances = get_pspl_pssm_distances( pspls, pssms, @@ -263,19 +251,21 @@ def predict_upstream_kinases( pssm_names=clusters, kinases=kinases, ) - return distances - def has_empty_clusters(self): + def has_empty_clusters(self) -> bool: + """ + Checks whether the most recent call to fit() resulted in empty clusters. + """ check_is_fitted(self, ["scores_"]) return np.unique(self.labels()).size != self.n_components - def predict(self) -> np.ndarray: + def predict(self) -> np.ndarray[int]: """Provided the current model parameters, predict the cluster each peptide belongs to.""" check_is_fitted(self, ["scores_"]) return np.argmax(self.scores_, axis=1) - def labels(self) -> np.ndarray: + def labels(self) -> np.ndarray[int]: """Find cluster assignment with highest likelihood for each peptide.""" return self.predict() @@ -286,12 +276,25 @@ def score(self) -> float: def get_pspl_pssm_distances( - pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None + pspls: np.ndarray, + pssms: np.ndarray, + as_df=False, + pssm_names: Sequence[str] = None, + kinases: Sequence[str] = None, ) -> np.ndarray | pd.DataFrame: """ + Computes a distance matrix between PSPLs and PSSMs. + Args: pspls: kinase specificity profiles of shape (n_kinase, 20, 9) pssms: position-specific scoring matrices of shape (n_pssms, 20, 11) + as_df: Whether or not the returned matrix should be returned as a + dataframe. Requires pssm_names and kinases. + pssm_names: list of names for the pssms of shape (n_pssms,) + kinases: list of names for the pspls of shape (n_kinase,) + + Returns: + Distance matrix of shape (n_kinase, n_pssms). """ assert pssms.shape[1:3] == (20, 11) assert pspls.shape[1:3] == (20, 9) diff --git a/ddmc/datasets.py b/ddmc/datasets.py index 92df38d8..9284de64 100644 --- a/ddmc/datasets.py +++ b/ddmc/datasets.py @@ -16,15 +16,31 @@ def filter_incomplete_peptides( min_experiments: int = None, sample_to_experiment: np.ndarray = None, ): + """ + Filters out missing values from p-signal array. + + Args: + sample_presence_ratio: the minimum fraction of non-missing values + allowed for a peptide before it is DROPPED. + min_experiments: the minimum number of experiments allowed for a peptide + before it is DROPPED. Must also pass in sample_to_experiment. + sample_to_experiment: array of shape `len(p_signal.columns)` that maps + each sample to an experiment (any identifier). + + Returns: + Filtered data. + """ # assume that X has sequences as the index and samples as columns if sample_presence_ratio is not None: peptide_idx = ( np.count_nonzero(~np.isnan(p_signal), axis=1) / p_signal.shape[1] >= sample_presence_ratio ) - else: + elif min_experiments is not None: assert min_experiments is not None assert sample_to_experiment is not None + # this is kind of confusing because of the use of numpy, but we're + # removing rows that have less than the minimum number of experiments unique_experiments = np.unique(sample_to_experiment) experiments_grid, s_to_e_grid = np.meshgrid( unique_experiments, sample_to_experiment, indexing="ij" @@ -34,12 +50,19 @@ def filter_incomplete_peptides( peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum( axis=0 ) >= min_experiments + else: + raise ValueError( + "Must specify either a sample presence or n_experiments threshold" + ) return p_signal.iloc[peptide_idx] def select_peptide_subset( p_signal: pd.DataFrame, keep_ratio: float = None, keep_num: int = None ): + """ + Selects a random subset of peptides from p_signal. + """ if keep_ratio is not None: keep_num = int(p_signal.shape[0] * keep_ratio) return p_signal.iloc[np.random.choice(p_signal.shape[0], keep_num)] @@ -66,7 +89,10 @@ def get_p_signal(self) -> pd.DataFrame: sample_to_experiment=self.get_sample_to_experiment(), ) - def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]): + def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]) -> np.ndarray: + """ + Get patients that have both NAT and tumor samples. + """ samples = samples.astype(str) samples = samples[np.char.find(samples, "IR") == -1] tumor_samples = np.sort(samples[~np.char.endswith(samples, ".N")]) @@ -75,7 +101,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]): nat_patients = np.char.replace(nat_samples, ".N", "") return np.intersect1d(tumor_patients, nat_patients) - def get_mutations(self, mutation_names: Sequence[str] = None): + def get_mutations(self, mutation_names: Sequence[str] = None) -> pd.DataFrame: mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv") mutations = mutations.set_index("Sample.ID") patients = self.get_patients_with_nat_and_tumor(mutations.index.values) @@ -84,7 +110,7 @@ def get_mutations(self, mutation_names: Sequence[str] = None): mutations = mutations[mutation_names] return mutations.astype(bool) - def get_hot_cold_labels(self): + def get_hot_cold_labels(self) -> pd.Series: hot_cold = ( pd.read_csv(self.data_dir / "Hot_Cold.csv") .dropna(axis=1) @@ -99,13 +125,16 @@ def get_hot_cold_labels(self): return np.squeeze(hot_cold).astype(bool) def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]: + """ + Get tumor vs NAT for each of samples. Returned array contains True if + tumor. + """ return ~np.array([sample.endswith(".N") for sample in samples]) # MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020) class EBDT: def get_p_signal(self) -> pd.DataFrame: - """Preprocess""" p_signal = ( pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv") .drop("FDR", axis=1) diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py index 9f9d58b4..6824caa9 100644 --- a/ddmc/figures/figureM2.py +++ b/ddmc/figures/figureM2.py @@ -196,5 +196,5 @@ def impute_ddmc(p_signal, n_clusters, weight, distance_method): return ( DDMC(n_clusters, weight, distance_method, max_iter=1) .fit(p_signal) - .impute(p_signal) + .impute() ) diff --git a/ddmc/pam250.py b/ddmc/pam250.py index 309a6285..607172ec 100644 --- a/ddmc/pam250.py +++ b/ddmc/pam250.py @@ -7,7 +7,7 @@ class PAM250: def __init__(self, seqs: list[str]): # Compute all pairwise distances - self.background = MotifPam250Scores(seqs) + self.background = get_pam250_scores(seqs) self.logWeights = 0.0 def from_summaries(self, weightsIn: np.ndarray): @@ -17,7 +17,7 @@ def from_summaries(self, weightsIn: np.ndarray): self.logWeights = (self.background @ weightsIn) / sums -def MotifPam250Scores(seqs: list[str]) -> np.ndarray: +def get_pam250_scores(seqs: list[str]) -> np.ndarray: """Calculate and store all pairwise pam250 distances before starting.""" pam250 = substitution_matrices.load("PAM250") seqs = np.array( @@ -27,17 +27,10 @@ def MotifPam250Scores(seqs: list[str]) -> np.ndarray: # convert to np array pam250m = np.array(pam250.values(), dtype=np.int8).reshape(pam250.shape) - out = distanceCalc(seqs, pam250m) - - i_upper = np.triu_indices_from(out, k=1) - out[i_upper] = out.T[i_upper] # pylint: disable=unsubscriptable-object - return out - - -def distanceCalc(seqs, pam250m: np.ndarray): - """Calculate all the pairwise distances.""" - # WARNING this type can only hold -128 to 127 out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8) i_idx, j_idx = np.tril_indices(seqs.shape[0]) out[i_idx, j_idx] = np.sum(pam250m[seqs[i_idx], seqs[j_idx]], axis=1) + + i_upper = np.triu_indices_from(out, k=1) + out[i_upper] = out.T[i_upper] # pylint: disable=unsubscriptable-object return out diff --git a/ddmc/tests/test_CoClustering.py b/ddmc/tests/test_CoClustering.py deleted file mode 100644 index eaa56d6f..00000000 --- a/ddmc/tests/test_CoClustering.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Testing file for the clustering methods by data and sequence. -""" - -import pytest -import numpy as np -import pandas as pd -from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides - - -X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] -X = filter_NaNpeptides(X, tmt=25) -data = X.select_dtypes(include=["float64"]).T -seqs = X["Sequence"] - - -@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) -def test_wins(distance_method): - """Test that EMclustering is working by comparing with GMM clusters.""" - MSC = DDMC(seqs, 2, seq_weight=0, distance_method=distance_method).fit(X=data) - distances = MSC.wins(data) - - # assert that the distance to the same sequence weight is less - assert distances[0] < 1.0 - assert distances[0] < distances[1] - - -@pytest.mark.parametrize("w", [0, 0.1, 10.0]) -@pytest.mark.parametrize("ncl", [2, 5, 25]) -@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) -def test_clusters(w, ncl, distance_method): - """Test that EMclustering is working by comparing with GMM clusters.""" - MSC = DDMC(seqs, ncl, seq_weight=w, distance_method=distance_method).fit(X=data) - - # Assert that we got a reasonable result - assert np.all(np.isfinite(MSC.scores_)) - assert np.all(np.isfinite(MSC.seq_scores_)) diff --git a/ddmc/tests/test_DifClusters.py b/ddmc/tests/test_DifClusters.py deleted file mode 100644 index 69b4515d..00000000 --- a/ddmc/tests/test_DifClusters.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -Testing file for minimum variance across clusters using CPTAC data. -""" - -import pytest -import numpy as np -import pandas as pd -from scipy.spatial.distance import cdist -from ..clustering import DDMC -from ..pre_processing import filter_NaNpeptides - -X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:] -X = filter_NaNpeptides(X, tmt=25) -d = X.select_dtypes(include=["float64"]).T - - -@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) -def test_ClusterVar(distance_method): - """Test minimum variance of output cluster centers""" - model = DDMC(X["Sequence"], 6, seq_weight=3, distance_method=distance_method).fit( - X=d - ) - centers = model.transform() - - # Get pairwise cluster distances - dists = cdist(centers.T, centers.T) - np.fill_diagonal(dists, 10.0) - - # Assert that all the clusters are at least euclidean distance 1 away - assert np.all(dists > 1.0) diff --git a/ddmc/tests/test_cluster.py b/ddmc/tests/test_cluster.py new file mode 100644 index 00000000..2f3e9aab --- /dev/null +++ b/ddmc/tests/test_cluster.py @@ -0,0 +1,64 @@ +""" +Testing file for the clustering methods by data and sequence. +""" + +import pytest +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.mixture import GaussianMixture + +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, filter_incomplete_peptides + + +def test_wins(): + """Test that EMclustering is working by comparing with GMM clusters.""" + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 + ) + model_ddmc = DDMC(n_components=2, seq_weight=0).fit(p_signal) + model_gmm = GaussianMixture(n_components=2).fit(p_signal.values) + + similarity = cosine_similarity(model_gmm.means_, model_ddmc.transform().T) + + # only works for 2 clusters, check that the two clusters are matched up + # either index-matching or otherwise + diag = np.eye(2, dtype=bool) + offdiag = ~diag + + assert np.all(similarity[diag] > 0.95) or np.all(similarity[offdiag] > 0.95) + + +@pytest.mark.parametrize("w", [0, 0.1, 10.0]) +@pytest.mark.parametrize("ncl", [2, 5, 25]) +@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) +def test_clusters(w, ncl, distance_method): + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 + ) + model = DDMC(ncl, seq_weight=w, distance_method=distance_method).fit(p_signal) + + # Assert that we got a reasonable result + assert np.all(np.isfinite(model.scores_)) + assert np.all(np.isfinite(model.seq_scores_)) + + +@pytest.mark.parametrize("distance_method", ["PAM250", "Binomial"]) +def test_ClusterVar(distance_method): + """Test minimum variance of output cluster centers""" + p_signal = filter_incomplete_peptides( + CPTAC().get_p_signal(), sample_presence_ratio=1 + ) + + model = DDMC(n_components=6, seq_weight=3, distance_method=distance_method).fit( + p_signal + ) + centers = model.transform() + + # Get pairwise cluster distances + dists = cdist(centers.T, centers.T) + np.fill_diagonal(dists, 10.0) + + # Assert that all the clusters are at least euclidean distance 1 away + assert np.all(dists > 1.0)