Skip to content

Commit

Permalink
remake figure 4
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 1, 2024
1 parent 13f29de commit d15659d
Show file tree
Hide file tree
Showing 9 changed files with 513 additions and 234 deletions.
71 changes: 41 additions & 30 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ class DDMC(GaussianMixture):

def __init__(
self,
sequences,
n_components: int,
seq_weight: float,
distance_method: Literal["PAM250", "Binomial"],
Expand All @@ -37,22 +36,19 @@ def __init__(
tol=tol,
random_state=random_state,
)
self.gen_peptide_distances(sequences, distance_method)
self.distance_method = distance_method
self.seq_weight = seq_weight

def gen_peptide_distances(self, seqs: np.ndarray | pd.DataFrame, distance_method):
def gen_peptide_distances(self, sequences: np.ndarray, distance_method):
# store parameters for sklearn's checks
self.distance_method = distance_method
if not isinstance(seqs, np.ndarray):
seqs = seqs.values
if seqs.dtype != str:
seqs = seqs.astype("str")
seqs = np.char.upper(seqs)
self.sequences = seqs
if sequences.dtype != str:
sequences = sequences.astype("str")
sequences = np.char.upper(sequences)
self.sequences = sequences
if distance_method == "PAM250":
self.seq_dist: PAM250 | Binomial = PAM250(seqs)
self.seq_dist: PAM250 | Binomial = PAM250(sequences)
elif distance_method == "Binomial":
self.seq_dist = Binomial(seqs)
self.seq_dist = Binomial(sequences)
else:
raise ValueError("Wrong distance type.")

Expand Down Expand Up @@ -88,7 +84,7 @@ def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
# Do sequence m step
self.seq_dist.from_summaries(np.exp(log_resp))

def fit(self, X: pd.DataFrame):
def fit(self, p_signal: pd.DataFrame):
"""
Compute EM clustering
Expand All @@ -99,20 +95,24 @@ def fit(self, X: pd.DataFrame):
"""
# TODO: assert that the peptides passed in match the length of X
# TODO: probably just pass in sequences here
d = np.array(X)
assert isinstance(p_signal, pd.DataFrame)
sequences = p_signal.index.values
assert isinstance(sequences[0], str) and len(sequences[0]) == 11, "The index of p_signal must be the peptide sequences of length 11"
self.p_signal = p_signal
self.gen_peptide_distances(sequences, self.distance_method)

if np.any(np.isnan(d)):
if np.any(np.isnan(p_signal)):
self._missing = True
self.missing_d = np.isnan(d)
self.missing_d = np.isnan(p_signal)

with warnings.catch_warnings():
warnings.simplefilter("ignore")
d = SoftImpute(verbose=False).fit_transform(d)
p_signal = SoftImpute(verbose=False).fit_transform(p_signal)
else:
self._missing = False

super().fit(d)
self.scores_ = self.predict_proba(d)
super().fit(p_signal)
self.scores_ = self.predict_proba(p_signal)

assert np.all(np.isfinite(self.scores_))
assert np.all(np.isfinite(self.seq_scores_))
Expand Down Expand Up @@ -143,10 +143,21 @@ def wins(self, X):

return (dataDist, seqDist)

def transform(self) -> np.ndarray:
"""Calculate cluster averages."""
def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
"""
Return cluster centers.
Args:
as_df: Whether or not the result should be wrapped in a dataframe with labeled axes.
Returns:
The cluster centers, either a np array or pd df of shape (n_samples, n_components).
"""
check_is_fitted(self, ["means_"])
return self.means_.T
centers = self.means_.T
if as_df:
centers = pd.DataFrame(centers, index=self.p_signal.columns, columns=np.arange(self.n_components))
return centers

def impute(self, X: np.ndarray) -> np.ndarray:
"""Impute a matching dataset."""
Expand All @@ -166,19 +177,19 @@ def get_pssms(self, PsP_background=False, clusters: List=None):
Note, to normalize by amino acid frequency this uses either
all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
"""
pssms, pssm_names = [], []
pssm_names, pssms = [], []
if PsP_background:
bg_seqs = BackgroundSeqs(self.sequences)
back_pssm = compute_control_pssm(bg_seqs)
else:
back_pssm = np.zeros((len(AAlist), 11), dtype=float)

l1 = list(np.arange(self.n_components) + 1)
l2 = list(set(self.labels()))
ec = [i for i in l1 + l2 if i not in l1 or i not in l2]
for ii in range(1, self.n_components + 1):
# Check for empty clusters and ignore them, if there are
l1 = list(np.arange(self.n_components) + 1)
l2 = list(set(self.labels()))
ec = [i for i in l1 + l2 if i not in l1 or i not in l2]
if ii in ec:
continue
if ii in ec: continue

# Compute PSSM
pssm = np.zeros((len(AAlist), 11), dtype=float)
Expand Down Expand Up @@ -254,7 +265,7 @@ def predict(self) -> np.ndarray:

def labels(self) -> np.ndarray:
"""Find cluster assignment with highest likelihood for each peptide."""
return self.predict() + 1
return self.predict()

def score(self) -> float:
"""Generate score of the fitting."""
Expand All @@ -268,7 +279,7 @@ def get_pspl_pssm_distances(
"""
Args:
pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
pssms: position-specific scoring matrices of shape (n_peptides, 20, 11)
pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
"""
assert pssms.shape[1:3] == (20, 11)
assert pspls.shape[1:3] == (20, 9)
Expand Down
93 changes: 93 additions & 0 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd
from typing import Sequence

from pathlib import Path

DATA_DIR = Path(__file__).parent / "data"


def filter_incomplete_peptides(
p_signal: pd.DataFrame,
sample_presence_perc: float = None,
min_experiments: int = None,
sample_to_experiment: np.ndarray = None,
):
# assume that X has sequences as the index and samples as columns
if sample_presence_perc is not None:
peptide_idx = (
np.count_nonzero(~np.isnan(p_signal), axis=1) / p_signal.shape[1]
>= sample_presence_perc
)
else:
assert min_experiments is not None
assert sample_to_experiment is not None
unique_experiments = np.unique(sample_to_experiment)
experiments_grid, StoE_grid = np.meshgrid(
unique_experiments, sample_to_experiment, indexing="ij"
)
bool_matrix = experiments_grid == StoE_grid
present = ~np.isnan(p_signal.values)
peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum(
axis=0
) > min_experiments
return p_signal.iloc[peptide_idx]


def select_peptide_subset(
p_signal: pd.DataFrame, keep_ratio: float = None, keep_num: int = None
):
if keep_ratio is not None:
keep_num = int(p_signal.shape[0] * keep_ratio)
return p_signal.iloc[np.random.choice(p_signal.shape[0], keep_num)]


class CPTAC:
data_dir = DATA_DIR / "MS" / "CPTAC"

def load_sample_to_experiment(self, as_df=False):
sample_to_experiment = pd.read_csv(self.data_dir / "IDtoExperiment.csv")
if as_df:
return sample_to_experiment
return sample_to_experiment.iloc[:, 1].values

def get_p_signal(self) -> pd.DataFrame:
p_signal = pd.read_csv(self.data_dir / "CPTAC-preprocessedMotifs.csv").iloc[
:, 1:
]
p_signal = p_signal.set_index("Sequence")
p_signal = p_signal.drop(columns=["Protein", "Gene", "Position"])
return filter_incomplete_peptides(
p_signal,
min_experiments=2,
sample_to_experiment=self.load_sample_to_experiment(),
)

def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
samples = samples.astype(str)
samples = samples[np.char.find(samples, "IR") == -1]
tumor_samples = np.sort(samples[~np.char.endswith(samples, ".N")])
nat_samples = np.sort(samples[np.char.endswith(samples, ".N")])
tumor_patients = tumor_samples
nat_patients = np.char.replace(nat_samples, ".N", "")
return np.intersect1d(tumor_patients, nat_patients)

def get_mutations(self, mutation_names: Sequence[str]):
mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv")
mutations = mutations.set_index("Sample.ID")
patients = self.get_patients_with_nat_and_tumor(mutations.index.values)
mutations = mutations.loc[patients]
return mutations[mutation_names]

def get_hot_cold_labels(self):
hot_cold = (
pd.read_csv(self.data_dir / "Hot_Cold.csv")
.dropna(axis=1)
.sort_values(by="Sample ID")
.set_index("Sample ID")
)["Group"]
hot_cold = hot_cold[hot_cold != "NAT enriched"]
hot_cold = hot_cold.replace("Cold-tumor enriched", 0)
hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
hot_cold = hot_cold.dropna()
return np.squeeze(hot_cold)
1 change: 1 addition & 0 deletions ddmc/figures/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import sys
import time
from pathlib import Path
from string import ascii_uppercase
from matplotlib import gridspec, pyplot as plt, axes, rcParams
import seaborn as sns
Expand Down
Loading

0 comments on commit d15659d

Please sign in to comment.