figure 2 and 3

meyer-lab · Jan 31, 2024 · 31f9b86 · 31f9b86
1 parent 33da12c
commit 31f9b86
Show file tree

Hide file tree

Showing 16 changed files with 397 additions and 279 deletions.
diff --git a/WeightSearch.csv b/WeightSearch.csv
@@ -1,2 +1,3 @@
-,Clusters,DDMC,Average,Zero,PCA
-0,1.0,,,,
+,Weight,DDMC,Average,Zero,PCA
+0,0.0,,,,
+1,100.0,,,,
diff --git a/ddmc/binomial.py b/ddmc/binomial.py
@@ -54,15 +54,15 @@ def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
     return res
 
 
-def BackgroundSeqs(forseqs: pd.Series) -> list[str]:
+def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]:
     """Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
     Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
     Source: https://www.phosphosite.org/staticDownloads.action -
     Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
     Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
     PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
     # Get porportion of psite types in foreground set
-    forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs.astype(str).tolist())
+    forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs)
     forw_tot = forw_pYn + forw_pSn + forw_pTn
 
     pYf = forw_pYn / forw_tot
@@ -132,9 +132,9 @@ def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[s
 class Binomial:
     """Definition of the binomial sequence distance distribution."""
 
-    def __init__(self, seq: pd.Series, seqs: list[str]):
+    def __init__(self, seqs: np.ndarray[str]):
         # Background sequences
-        background = position_weight_matrix(BackgroundSeqs(seq))
+        background = position_weight_matrix(BackgroundSeqs(seqs))
         self.background = np.array([background[AA] for AA in AAlist])
         self.foreground: np.ndarray = GenerateBinarySeqID(seqs)
 
@@ -152,7 +152,7 @@ def from_summaries(self, weightsIn: np.ndarray):
         self.logWeights = np.log(tempp)
 
 
-def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]:
+def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]:
     """Count the number of different phosphorylation types in an MS data set.
 
     Args:
@@ -161,11 +161,13 @@ def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]:
     Returns:
         tuple[int, int, int]: The number of pY, pS, and pT sites.
     """
+    X = np.char.upper(X)
+
     # Find the center amino acid
     cA = int((len(X[0]) - 1) / 2)
 
-    positionSeq = [seq[cA] for seq in X]
-    pS = positionSeq.count("s")
-    pT = positionSeq.count("t")
-    pY = positionSeq.count("y")
+    phospho_aminos = [seq[cA] for seq in X]
+    pS = phospho_aminos.count("S")
+    pT = phospho_aminos.count("T")
+    pY = phospho_aminos.count("Y")
     return pY, pS, pT
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -1,6 +1,6 @@
 """ Clustering functions. """
 
-from typing import Literal
+from typing import Literal, List, Dict
 import warnings
 from copy import deepcopy
 import itertools
@@ -10,7 +10,7 @@
 from sklearn.utils.validation import check_is_fitted
 from .binomial import Binomial, AAlist, BackgroundSeqs, frequencies
 from .pam250 import PAM250
-from .motifs import PSPLdict, compute_control_pssm
+from .motifs import get_pspls, compute_control_pssm
 from fancyimpute import SoftImpute
 
 
@@ -21,7 +21,7 @@ class DDMC(GaussianMixture):
 
     def __init__(
         self,
-        sequences: pd.Series,
+        sequences,
         n_components: int,
         seq_weight: float,
         distance_method: Literal["PAM250", "Binomial"],
@@ -37,24 +37,25 @@ def __init__(
             tol=tol,
             random_state=random_state,
         )
-
-        self.sequences = sequences
+        self.gen_peptide_distances(sequences, distance_method)
         self.seq_weight = seq_weight
-        self.distance_method = distance_method
 
-        seqs = sequences.str.upper().to_list()
-
-
-
-    def gen_peptide_distances(self, seqs, distance_method):
+    def gen_peptide_distances(self, seqs: np.ndarray | pd.DataFrame, distance_method):
+        # store parameters for sklearn's checks
+        self.distance_method = distance_method
+        if not isinstance(seqs, np.ndarray):
+            seqs = seqs.values
+        if seqs.dtype != str:
+            seqs = seqs.astype("str")
+        seqs = np.char.upper(seqs)
+        self.sequences = seqs
         if distance_method == "PAM250":
             self.seq_dist: PAM250 | Binomial = PAM250(seqs)
         elif distance_method == "Binomial":
-            self.seq_dist = Binomial(sequences, seqs)
+            self.seq_dist = Binomial(seqs)
         else:
             raise ValueError("Wrong distance type.")
 
-
     def _estimate_log_prob(self, X: np.ndarray):
         """Estimate the log-probability of each point in each cluster."""
         logp = super()._estimate_log_prob(X)  # Do the regular work
@@ -87,9 +88,18 @@ def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
         # Do sequence m step
         self.seq_dist.from_summaries(np.exp(log_resp))
 
-    def fit(self, X: np.ndarray | pd.DataFrame, y=None):
-        """Compute EM clustering"""
-        d = np.array(X.T)
+    def fit(self, X: pd.DataFrame):
+        """
+        Compute EM clustering
+
+        Args:
+            X: dataframe consisting of a "Sequence" column, and sample
+                columns. Every column that is not named "Sequence" will be treated
+                as a sample.
+        """
+        # TODO: assert that the peptides passed in match the length of X
+        # TODO: probably just pass in sequences here
+        d = np.array(X)
 
         if np.any(np.isnan(d)):
             self._missing = True
@@ -151,12 +161,12 @@ def impute(self, X: np.ndarray) -> np.ndarray:
         assert np.all(np.isfinite(X))
         return X
 
-    def pssms(self, PsP_background=False):
+    def get_pssms(self, PsP_background=False, clusters: List=None):
         """Compute position-specific scoring matrix of each cluster.
         Note, to normalize by amino acid frequency this uses either
         all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
         """
-        pssms, cl_num = [], []
+        pssms, pssm_names = [], []
         if PsP_background:
             bg_seqs = BackgroundSeqs(self.sequences)
             back_pssm = compute_control_pssm(bg_seqs)
@@ -200,7 +210,7 @@ def pssms(self, PsP_background=False):
             pssm.index = AAlist
 
             # Normalize phosphoacceptor position to frequency
-            df = pd.DataFrame(self.sequences.str.upper())
+            df = pd.DataFrame({"Sequence": self.sequences})
             df["Cluster"] = self.labels()
             clSeq = df[df["Cluster"] == ii]["Sequence"]
             clSeq = pd.DataFrame(frequencies(clSeq)).T
@@ -209,34 +219,33 @@ def pssms(self, PsP_background=False):
                 pssm.loc[p_site, 5] = np.log2(clSeq.loc[p_site, 5] / tm)
 
             pssms.append(np.clip(pssm, a_min=0, a_max=3))
-            cl_num.append(ii)
+            pssm_names.append(ii)
+
+        pssm_names, pssms = np.array(pssm_names), np.array(pssms)
+
+        if clusters is not None:
+            return pssms[[np.where(pssm_names == cluster)[0][0] for cluster in clusters]]
 
-        return pssms, cl_num
+        return pssm_names, pssms
 
-    def predict_UpstreamKinases(
-        self, additional_pssms=False, add_labels=False, PsP_background=True
+    def predict_upstream_kinases(
+        self,
+        PsP_background=True,
     ):
-        """Compute matrix-matrix similarity between kinase specificity profiles and cluster PSSMs to identify upstream kinases regulating clusters."""
-        PSPLs = PSPLdict()
-        PSSMs, cl_num = self.pssms(PsP_background=PsP_background)
-
-        # Optionally add external pssms
-        if not isinstance(additional_pssms, bool):
-            PSSMs += additional_pssms
-            cl_num += add_labels
-        PSSMs = [
-            np.delete(np.array(list(np.array(mat))), [5, 10], axis=1) for mat in PSSMs
-        ]  # Remove P0 and P+5 from pssms
-
-        a = np.zeros((len(PSPLs), len(PSSMs)))
-        for ii, spec_profile in enumerate(PSPLs.values()):
-            for jj, pssm in enumerate(PSSMs):
-                a[ii, jj] = np.linalg.norm(pssm - spec_profile)
-
-        table = pd.DataFrame(a)
-        table.columns = cl_num
-        table.insert(0, "Kinase", list(PSPLdict().keys()))
-        return table
+        """Compute matrix-matrix similarity between kinase specificity profiles
+        and cluster PSSMs to identify upstream kinases regulating clusters."""
+        kinases, pspls = get_pspls()
+        clusters, pssms = self.get_pssms(PsP_background=PsP_background)
+
+        distances = get_pspl_pssm_distances(
+            pspls,
+            pssms,
+            as_df=True,
+            pssm_names=clusters,
+            kinases=kinases,
+        )
+
+        return distances
 
     def predict(self) -> np.ndarray:
         """Provided the current model parameters, predict the cluster each peptide belongs to."""
@@ -252,3 +261,19 @@ def score(self) -> float:
         check_is_fitted(self, ["lower_bound_"])
         return self.lower_bound_
 
+
+def get_pspl_pssm_distances(
+    pspls: np.ndarray, pssms: np.ndarray, as_df=False, pssm_names=None, kinases=None
+) -> np.ndarray | pd.DataFrame:
+    """
+    Args:
+        pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
+        pssms: position-specific scoring matrices of shape (n_peptides, 20, 11) 
+    """
+    assert pssms.shape[1:3] == (20, 11)
+    assert pspls.shape[1:3] == (20, 9)
+    pssms = np.delete(pssms, [5, 10], axis=2)
+    dists = np.linalg.norm(pspls[:, None, :, :] - pssms[None, :, :, :], axis=(2, 3))
+    if as_df:
+        dists = pd.DataFrame(dists, index=kinases, columns=pssm_names)
+    return dists
diff --git a/...ata/MS/CPTAC/CPTAC-preprocessedMotfis.csv → ...ata/MS/CPTAC/CPTAC-preprocessedMotifs.csv b/...ata/MS/CPTAC/CPTAC-preprocessedMotfis.csv → ...ata/MS/CPTAC/CPTAC-preprocessedMotifs.csv