Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 3, 2024
1 parent f6a0b34 commit 3a905f1
Show file tree
Hide file tree
Showing 7 changed files with 153 additions and 132 deletions.
95 changes: 49 additions & 46 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" Clustering functions. """

from typing import Literal, List, Dict
from typing import Literal, List, Sequence, Tuple
import warnings
from copy import deepcopy
import itertools
Expand Down Expand Up @@ -39,7 +39,6 @@ def __init__(
self.seq_weight = seq_weight

def gen_peptide_distances(self, sequences: np.ndarray, distance_method):
# store parameters for sklearn's checks
if sequences.dtype != str:
sequences = sequences.astype("str")
sequences = np.char.upper(sequences)
Expand Down Expand Up @@ -119,31 +118,6 @@ def fit(self, p_signal: pd.DataFrame):
assert np.all(np.isfinite(self.seq_scores_))
return self

def wins(self, X):
"""Find similarity of fitted model to data and sequence models"""
check_is_fitted(self, ["scores_", "seq_scores_"])

alt_model = deepcopy(self)
alt_model.seq_weight = 0.0 # No influence
alt_model.fit(X)
data_model = alt_model.scores_

alt_model.seq_weight = 1000.0 # Overwhelming influence
alt_model.fit(X)
seq_model = alt_model.scores_

dataDist = np.linalg.norm(self.scores_ - data_model)
seqDist = np.linalg.norm(self.scores_ - seq_model)

for i in itertools.permutations(np.arange(self.n_components)):
dataDistTemp = np.linalg.norm(self.scores_ - data_model[:, i])
seqDistTemp = np.linalg.norm(self.scores_ - seq_model[:, i])

dataDist = np.minimum(dataDist, dataDistTemp)
seqDist = np.minimum(seqDist, seqDistTemp)

return (dataDist, seqDist)

def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
"""
Return cluster centers.
Expand All @@ -164,23 +138,38 @@ def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
)
return centers

def impute(self) -> pd.DataFrame:
    """
    Impute missing values in the dataset passed to fit() and return the
    imputed dataset.

    Every missing (NaN) entry of a peptide is replaced by the value that the
    center of the peptide's assigned cluster takes for the same sample.

    Returns:
        A new peptides x samples dataset in which every NaN has been filled
        in. The data stored on the model is not modified. If the stored
        data is a dataframe, its index and columns are preserved; otherwise
        a plain array is returned.

    Raises:
        AssertionError: if any value is still non-finite after imputation.
    """
    labels = self.labels()  # cluster assignment of each peptide
    centers = self.transform()  # samples x clusters

    # Work on a float ndarray copy: a DataFrame cannot be indexed with
    # ``[row, boolean_mask]`` the way an ndarray can.
    values = np.array(self.p_signal, dtype=float)
    for ii in range(values.shape[0]):
        missing = np.isnan(values[ii, :])
        # labels() returns zero-based argmax indices, so the label is used
        # directly as the column of ``centers``.
        values[ii, missing] = centers[missing, labels[ii]]

    assert np.all(np.isfinite(values))
    if isinstance(self.p_signal, pd.DataFrame):
        return pd.DataFrame(
            values, index=self.p_signal.index, columns=self.p_signal.columns
        )
    return values

def get_pssms(self, PsP_background=False, clusters: List = None):
"""Compute position-specific scoring matrix of each cluster.
def get_pssms(
self, PsP_background=False, clusters: List[int] = None
) -> Tuple[np.ndarray, np.ndarray] | np.ndarray:
"""
Compute position-specific scoring matrix of each cluster.
Note, to normalize by amino acid frequency this uses either
all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
Args:
PsP_background: Whether or not PhosphoSitePlus should be used for background frequency.
clusters: cluster indices to get pssms for
Returns:
If the clusters argument is used, an array of shape (len(clusters), 20, 11),
else two arrays, where the first (of shape (n_pssms,))
contains the clusters of the pssms in the second
(of shape (n_pssms, 20, 11)).
"""
pssm_names, pssms = [], []

Check warning on line 174 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L174

Added line #L174 was not covered by tests
if PsP_background:
Expand Down Expand Up @@ -250,32 +239,33 @@ def get_pssms(self, PsP_background=False, clusters: List = None):
def predict_upstream_kinases(
    self,
    PsP_background=True,
) -> pd.DataFrame:
    """
    Compute matrix-matrix similarity between kinase specificity profiles
    and cluster PSSMs to identify upstream kinases regulating clusters.

    Args:
        PsP_background: forwarded unchanged to get_pssms().

    Returns:
        The distances computed by get_pspl_pssm_distances(), requested as a
        dataframe (as_df=True) and labelled with the kinase names and the
        cluster identifiers returned by get_pssms().
    """
    kinases, pspls = get_pspls()
    clusters, pssms = self.get_pssms(PsP_background=PsP_background)

    # as_df=True makes the helper return a dataframe rather than a bare
    # array, hence the pd.DataFrame return annotation.
    return get_pspl_pssm_distances(
        pspls,
        pssms,
        as_df=True,
        pssm_names=clusters,
        kinases=kinases,
    )

Check warning on line 254 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L254

Added line #L254 was not covered by tests

def has_empty_clusters(self):
def has_empty_clusters(self) -> bool:
    """
    Report whether the most recent call to fit() left any cluster empty,
    i.e. whether the number of distinct assigned labels differs from
    n_components.
    """
    check_is_fitted(self, ["scores_"])
    assigned_clusters = np.unique(self.labels())
    return assigned_clusters.size != self.n_components

Check warning on line 261 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L260-L261

Added lines #L260 - L261 were not covered by tests

def predict(self) -> np.ndarray:
def predict(self) -> np.ndarray[int]:
    """
    Using the current model parameters, return for each peptide the index
    of the cluster with the highest score.
    """
    check_is_fitted(self, ["scores_"])
    best_cluster = np.argmax(self.scores_, axis=1)
    return best_cluster

def labels(self) -> np.ndarray:
def labels(self) -> np.ndarray[int]:
    """Return the most likely cluster assignment of each peptide (delegates to predict())."""
    assignments = self.predict()
    return assignments

Check warning on line 270 in ddmc/clustering.py

View check run for this annotation

Codecov / codecov/patch

ddmc/clustering.py#L270

Added line #L270 was not covered by tests

Expand All @@ -286,12 +276,25 @@ def score(self) -> float:


def get_pspl_pssm_distances(
pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None
pspls: np.ndarray,
pssms: np.ndarray,
as_df=False,
pssm_names: Sequence[str] = None,
kinases: Sequence[str] = None,
) -> np.ndarray | pd.DataFrame:
"""
Computes a distance matrix between PSPLs and PSSMs.
Args:
pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
as_df: Whether or not the returned matrix should be returned as a
dataframe. Requires pssm_names and kinases.
pssm_names: list of names for the pssms of shape (n_pssms,)
kinases: list of names for the pspls of shape (n_kinase,)
Returns:
Distance matrix of shape (n_kinase, n_pssms).
"""
assert pssms.shape[1:3] == (20, 11)
assert pspls.shape[1:3] == (20, 9)
Expand Down
39 changes: 34 additions & 5 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,31 @@ def filter_incomplete_peptides(
min_experiments: int = None,
sample_to_experiment: np.ndarray = None,
):
"""
Filters out peptides (rows) of the p-signal dataframe that have too many
missing values.
Args:
sample_presence_ratio: the minimum fraction of non-missing values a
peptide must have to be KEPT; peptides below this fraction are DROPPED.
min_experiments: the minimum number of experiments in which a peptide
must have a non-missing value to be KEPT; peptides below this number
are DROPPED. Must also pass in sample_to_experiment.
sample_to_experiment: array of shape `len(p_signal.columns)` that maps
each sample to an experiment (any identifier).
Returns:
Filtered data.
"""
# assume that X has sequences as the index and samples as columns
if sample_presence_ratio is not None:
peptide_idx = (
np.count_nonzero(~np.isnan(p_signal), axis=1) / p_signal.shape[1]
>= sample_presence_ratio
)
else:
elif min_experiments is not None:
assert min_experiments is not None
assert sample_to_experiment is not None
# this is kind of confusing because of the use of numpy, but we're
# removing rows that have less than the minimum number of experiments
unique_experiments = np.unique(sample_to_experiment)
experiments_grid, s_to_e_grid = np.meshgrid(
unique_experiments, sample_to_experiment, indexing="ij"
Expand All @@ -34,12 +50,19 @@ def filter_incomplete_peptides(
peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum(
axis=0
) >= min_experiments
else:
raise ValueError(

Check warning on line 54 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L54

Added line #L54 was not covered by tests
"Must specify either a sample presence or n_experiments threshold"
)
return p_signal.iloc[peptide_idx]


def select_peptide_subset(
    p_signal: pd.DataFrame, keep_ratio: float = None, keep_num: int = None
):
    """
    Selects a random subset of peptides (rows) from p_signal.

    Args:
        p_signal: dataframe whose rows are sampled.
        keep_ratio: fraction of the rows to keep. When given, it takes
            precedence over keep_num.
        keep_num: number of rows to keep. Only used if keep_ratio is None.

    Returns:
        Dataframe made of the randomly chosen, distinct rows of p_signal.

    Raises:
        ValueError: if neither keep_ratio nor keep_num is given, or (raised
            by numpy) if more rows are requested than p_signal contains.
    """
    if keep_ratio is not None:
        keep_num = int(p_signal.shape[0] * keep_ratio)
    elif keep_num is None:
        raise ValueError("Must specify either keep_ratio or keep_num")

    # Sample without replacement so that no peptide is selected twice.
    chosen_rows = np.random.choice(p_signal.shape[0], keep_num, replace=False)
    return p_signal.iloc[chosen_rows]

Check warning on line 68 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L66-L68

Added lines #L66 - L68 were not covered by tests
Expand All @@ -66,7 +89,10 @@ def get_p_signal(self) -> pd.DataFrame:
sample_to_experiment=self.get_sample_to_experiment(),
)

def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]) -> np.ndarray:
"""
Get patients that have both NAT and tumor samples.
"""
samples = samples.astype(str)
samples = samples[np.char.find(samples, "IR") == -1]
tumor_samples = np.sort(samples[~np.char.endswith(samples, ".N")])
Expand All @@ -75,7 +101,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
nat_patients = np.char.replace(nat_samples, ".N", "")
return np.intersect1d(tumor_patients, nat_patients)

Check warning on line 102 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L96-L102

Added lines #L96 - L102 were not covered by tests

def get_mutations(self, mutation_names: Sequence[str] = None):
def get_mutations(self, mutation_names: Sequence[str] = None) -> pd.DataFrame:
mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv")
mutations = mutations.set_index("Sample.ID")
patients = self.get_patients_with_nat_and_tumor(mutations.index.values)
Expand All @@ -84,7 +110,7 @@ def get_mutations(self, mutation_names: Sequence[str] = None):
mutations = mutations[mutation_names]
return mutations.astype(bool)

Check warning on line 111 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L105-L111

Added lines #L105 - L111 were not covered by tests

def get_hot_cold_labels(self):
def get_hot_cold_labels(self) -> pd.Series:
hot_cold = (

Check warning on line 114 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L114

Added line #L114 was not covered by tests
pd.read_csv(self.data_dir / "Hot_Cold.csv")
.dropna(axis=1)
Expand All @@ -99,13 +125,16 @@ def get_hot_cold_labels(self):
return np.squeeze(hot_cold).astype(bool)

Check warning on line 125 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L120-L125

Added lines #L120 - L125 were not covered by tests

def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
    """
    Get tumor vs NAT for each of samples.

    Args:
        samples: sample identifiers; an identifier ending in ".N" is
            treated as NAT, anything else as tumor.

    Returns:
        Boolean array with one entry per sample, True if tumor.
    """
    # Build the flags with an explicit bool dtype: inverting an empty array
    # with ``~`` fails because an empty array defaults to float.
    return np.array([not sample.endswith(".N") for sample in samples], dtype=bool)

Check warning on line 132 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L132

Added line #L132 was not covered by tests


# MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)
class EBDT:
def get_p_signal(self) -> pd.DataFrame:
"""Preprocess"""
p_signal = (

Check warning on line 138 in ddmc/datasets.py

View check run for this annotation

Codecov / codecov/patch

ddmc/datasets.py#L138

Added line #L138 was not covered by tests
pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv")
.drop("FDR", axis=1)
Expand Down
2 changes: 1 addition & 1 deletion ddmc/figures/figureM2.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,5 +196,5 @@ def impute_ddmc(p_signal, n_clusters, weight, distance_method):
return (
DDMC(n_clusters, weight, distance_method, max_iter=1)
.fit(p_signal)
.impute(p_signal)
.impute()
)
17 changes: 5 additions & 12 deletions ddmc/pam250.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class PAM250:
def __init__(self, seqs: list[str]):
# Compute all pairwise distances
self.background = MotifPam250Scores(seqs)
self.background = get_pam250_scores(seqs)
self.logWeights = 0.0

def from_summaries(self, weightsIn: np.ndarray):
Expand All @@ -17,7 +17,7 @@ def from_summaries(self, weightsIn: np.ndarray):
self.logWeights = (self.background @ weightsIn) / sums


def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
def get_pam250_scores(seqs: list[str]) -> np.ndarray:
"""Calculate and store all pairwise pam250 distances before starting."""
pam250 = substitution_matrices.load("PAM250")
seqs = np.array(
Expand All @@ -27,17 +27,10 @@ def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
# convert to np array
pam250m = np.array(pam250.values(), dtype=np.int8).reshape(pam250.shape)

out = distanceCalc(seqs, pam250m)

i_upper = np.triu_indices_from(out, k=1)
out[i_upper] = out.T[i_upper] # pylint: disable=unsubscriptable-object
return out


def distanceCalc(seqs, pam250m: np.ndarray):
"""Calculate all the pairwise distances."""
# WARNING this type can only hold -128 to 127
out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8)
i_idx, j_idx = np.tril_indices(seqs.shape[0])
out[i_idx, j_idx] = np.sum(pam250m[seqs[i_idx], seqs[j_idx]], axis=1)

i_upper = np.triu_indices_from(out, k=1)
out[i_upper] = out.T[i_upper] # pylint: disable=unsubscriptable-object
return out
38 changes: 0 additions & 38 deletions ddmc/tests/test_CoClustering.py

This file was deleted.

30 changes: 0 additions & 30 deletions ddmc/tests/test_DifClusters.py

This file was deleted.

Loading

0 comments on commit 3a905f1

Please sign in to comment.