Skip to content

Commit

Permalink
figure 2 and 3
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Jan 31, 2024
1 parent 33da12c commit 31f9b86
Show file tree
Hide file tree
Showing 16 changed files with 397 additions and 279 deletions.
5 changes: 3 additions & 2 deletions WeightSearch.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
,Clusters,DDMC,Average,Zero,PCA
0,1.0,,,,
,Weight,DDMC,Average,Zero,PCA
0,0.0,,,,
1,100.0,,,,
20 changes: 11 additions & 9 deletions ddmc/binomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
return res


def BackgroundSeqs(forseqs: pd.Series) -> list[str]:
def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]:
"""Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
Source: https://www.phosphosite.org/staticDownloads.action -
Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
# Get porportion of psite types in foreground set
forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs.astype(str).tolist())
forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs)
forw_tot = forw_pYn + forw_pSn + forw_pTn

pYf = forw_pYn / forw_tot
Expand Down Expand Up @@ -132,9 +132,9 @@ def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[s
class Binomial:
"""Definition of the binomial sequence distance distribution."""

def __init__(self, seq: pd.Series, seqs: list[str]):
def __init__(self, seqs: np.ndarray[str]):
# Background sequences
background = position_weight_matrix(BackgroundSeqs(seq))
background = position_weight_matrix(BackgroundSeqs(seqs))
self.background = np.array([background[AA] for AA in AAlist])
self.foreground: np.ndarray = GenerateBinarySeqID(seqs)

Expand All @@ -152,7 +152,7 @@ def from_summaries(self, weightsIn: np.ndarray):
self.logWeights = np.log(tempp)


def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]:
def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]:
"""Count the number of different phosphorylation types in an MS data set.
Args:
Expand All @@ -161,11 +161,13 @@ def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]:
Returns:
tuple[int, int, int]: The number of pY, pS, and pT sites.
"""
X = np.char.upper(X)

# Find the center amino acid
cA = int((len(X[0]) - 1) / 2)

positionSeq = [seq[cA] for seq in X]
pS = positionSeq.count("s")
pT = positionSeq.count("t")
pY = positionSeq.count("y")
phospho_aminos = [seq[cA] for seq in X]
pS = phospho_aminos.count("S")
pT = phospho_aminos.count("T")
pY = phospho_aminos.count("Y")
return pY, pS, pT
113 changes: 69 additions & 44 deletions ddmc/clustering.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" Clustering functions. """

from typing import Literal
from typing import Literal, List, Dict
import warnings
from copy import deepcopy
import itertools
Expand All @@ -10,7 +10,7 @@
from sklearn.utils.validation import check_is_fitted
from .binomial import Binomial, AAlist, BackgroundSeqs, frequencies
from .pam250 import PAM250
from .motifs import PSPLdict, compute_control_pssm
from .motifs import get_pspls, compute_control_pssm
from fancyimpute import SoftImpute


Expand All @@ -21,7 +21,7 @@ class DDMC(GaussianMixture):

def __init__(
self,
sequences: pd.Series,
sequences,
n_components: int,
seq_weight: float,
distance_method: Literal["PAM250", "Binomial"],
Expand All @@ -37,24 +37,25 @@ def __init__(
tol=tol,
random_state=random_state,
)

self.sequences = sequences
self.gen_peptide_distances(sequences, distance_method)
self.seq_weight = seq_weight
self.distance_method = distance_method

seqs = sequences.str.upper().to_list()



def gen_peptide_distances(self, seqs, distance_method):
def gen_peptide_distances(self, seqs: np.ndarray | pd.DataFrame, distance_method):
# store parameters for sklearn's checks
self.distance_method = distance_method
if not isinstance(seqs, np.ndarray):
seqs = seqs.values
if seqs.dtype != str:
seqs = seqs.astype("str")
seqs = np.char.upper(seqs)
self.sequences = seqs
if distance_method == "PAM250":
self.seq_dist: PAM250 | Binomial = PAM250(seqs)
elif distance_method == "Binomial":
self.seq_dist = Binomial(sequences, seqs)
self.seq_dist = Binomial(seqs)
else:
raise ValueError("Wrong distance type.")


def _estimate_log_prob(self, X: np.ndarray):
"""Estimate the log-probability of each point in each cluster."""
logp = super()._estimate_log_prob(X) # Do the regular work
Expand Down Expand Up @@ -87,9 +88,18 @@ def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
# Do sequence m step
self.seq_dist.from_summaries(np.exp(log_resp))

def fit(self, X: np.ndarray | pd.DataFrame, y=None):
"""Compute EM clustering"""
d = np.array(X.T)
def fit(self, X: pd.DataFrame):
"""
Compute EM clustering
Args:
X: dataframe consisting of a "Sequence" column, and sample
columns. Every column that is not named "Sequence" will be treated
as a sample.
"""
# TODO: assert that the peptides passed in match the length of X
# TODO: probably just pass in sequences here
d = np.array(X)

if np.any(np.isnan(d)):
self._missing = True
Expand Down Expand Up @@ -151,12 +161,12 @@ def impute(self, X: np.ndarray) -> np.ndarray:
assert np.all(np.isfinite(X))
return X

def pssms(self, PsP_background=False):
def get_pssms(self, PsP_background=False, clusters: List=None):
"""Compute position-specific scoring matrix of each cluster.
Note, to normalize by amino acid frequency this uses either
all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
"""
pssms, cl_num = [], []
pssms, pssm_names = [], []
if PsP_background:
bg_seqs = BackgroundSeqs(self.sequences)
back_pssm = compute_control_pssm(bg_seqs)
Expand Down Expand Up @@ -200,7 +210,7 @@ def pssms(self, PsP_background=False):
pssm.index = AAlist

# Normalize phosphoacceptor position to frequency
df = pd.DataFrame(self.sequences.str.upper())
df = pd.DataFrame({"Sequence": self.sequences})
df["Cluster"] = self.labels()
clSeq = df[df["Cluster"] == ii]["Sequence"]
clSeq = pd.DataFrame(frequencies(clSeq)).T
Expand All @@ -209,34 +219,33 @@ def pssms(self, PsP_background=False):
pssm.loc[p_site, 5] = np.log2(clSeq.loc[p_site, 5] / tm)

pssms.append(np.clip(pssm, a_min=0, a_max=3))
cl_num.append(ii)
pssm_names.append(ii)

pssm_names, pssms = np.array(pssm_names), np.array(pssms)

if clusters is not None:
return pssms[[np.where(pssm_names == cluster)[0][0] for cluster in clusters]]

return pssms, cl_num
return pssm_names, pssms

def predict_UpstreamKinases(
self, additional_pssms=False, add_labels=False, PsP_background=True
def predict_upstream_kinases(
self,
PsP_background=True,
):
"""Compute matrix-matrix similarity between kinase specificity profiles and cluster PSSMs to identify upstream kinases regulating clusters."""
PSPLs = PSPLdict()
PSSMs, cl_num = self.pssms(PsP_background=PsP_background)

# Optionally add external pssms
if not isinstance(additional_pssms, bool):
PSSMs += additional_pssms
cl_num += add_labels
PSSMs = [
np.delete(np.array(list(np.array(mat))), [5, 10], axis=1) for mat in PSSMs
] # Remove P0 and P+5 from pssms

a = np.zeros((len(PSPLs), len(PSSMs)))
for ii, spec_profile in enumerate(PSPLs.values()):
for jj, pssm in enumerate(PSSMs):
a[ii, jj] = np.linalg.norm(pssm - spec_profile)

table = pd.DataFrame(a)
table.columns = cl_num
table.insert(0, "Kinase", list(PSPLdict().keys()))
return table
"""Compute matrix-matrix similarity between kinase specificity profiles
and cluster PSSMs to identify upstream kinases regulating clusters."""
kinases, pspls = get_pspls()
clusters, pssms = self.get_pssms(PsP_background=PsP_background)

distances = get_pspl_pssm_distances(
pspls,
pssms,
as_df=True,
pssm_names=clusters,
kinases=kinases,
)

return distances

def predict(self) -> np.ndarray:
"""Provided the current model parameters, predict the cluster each peptide belongs to."""
Expand All @@ -252,3 +261,19 @@ def score(self) -> float:
check_is_fitted(self, ["lower_bound_"])
return self.lower_bound_


def get_pspl_pssm_distances(
pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None
) -> np.ndarray | pd.DataFrame:
"""
Args:
pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
pssms: position-specific scoring matrices of shape (n_peptides, 20, 11)
"""
assert pssms.shape[1:3] == (20, 11)
assert pspls.shape[1:3] == (20, 9)
pssms = np.delete(pssms, [5, 10], axis=2)
dists = np.linalg.norm(pspls[:, None, :, :] - pssms[None, :, :, :], axis=(2, 3))
if as_df:
dists = pd.DataFrame(dists, index=kinases, columns=pssm_names)
return dists
Loading

0 comments on commit 31f9b86

Please sign in to comment.