Cleanup to get figures building again (#533)

* Initial pass * Some fixes * Lots more cleanup * More cleanup * Fix MS7 * clean figureM2 * rename for consistency * stage * class changes * figure 2 and 3 * revert figure 4 * remake figure 4 * remove figure 4 copy * remake figure 5 * remake figure 6 * remake figure 7 * refactor * figure S2 * figure S3 * refactor pca functions * move ebdt to datasets * figure S4, renaming * figure MS5 * figure s7 * delete many things * fix tests * delete unused function, can always restore * add fit() documentation * update README and fix validation * Update build.yml * make figures more like before * small fix * small cv fold fixes * update random states --------- Co-authored-by: Armaan Abraham <[email protected]>
meyer-lab · Feb 10, 2024 · 123b392 · 123b392
1 parent d3fcbd6
commit 123b392
Show file tree

Hide file tree

Showing 32 changed files with 1,687 additions and 3,496 deletions.
diff --git a/.github/workflows/autopep8.yml b/.github/workflows/autopep8.yml
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,7 +10,7 @@ jobs:
     - name: Install dependencies
       run: poetry install
     - name: Build figures
-      run: make -j 3 all
+      run: make all
     - name: Upload files
       uses: actions/upload-artifact@v4
       with:

diff --git a/README.md b/README.md
@@ -3,4 +3,48 @@
 ![Build](https://github.com/meyer-lab/resistance-MS/workflows/Build/badge.svg)
 ![Test](https://github.com/meyer-lab/resistance-MS/workflows/Test/badge.svg)
 
-Predicting cell phenotype with mass spec data.
+Clusters peptides based on both sequence similarity and phosphorylation signal across samples.
+
+
+## Usage
+
+```
+>>> from ddmc.clustering import DDMC
+
+>>> # load dataset as p_signal...
+
+>>> p_signal
+             Sample 1  Sample 2  Sample 3  Sample 4  Sample 5
+Sequence                                                     
+AAAAAsQQGSA -3.583614       NaN -0.662659 -1.320029 -0.730832
+AAAAGsASPRS -0.174779 -1.796899  0.891798 -3.092941  2.394315
+AAAAGsGPSPP -1.951552 -2.937095  2.692876 -2.344894  0.556615
+AAAAGsGPsPP  3.666782       NaN -2.081231  0.989394       NaN
+AAAAPsPGSAR  1.753855 -2.135835  0.896778  3.369230  2.020967
+...               ...       ...       ...       ...       ...
+YYSPYsVSGSG -3.502871  2.831169  3.383486  2.589559  3.624968
+YYSSRsQSGGY -0.870365  0.887317  2.600291 -0.374107  3.285459
+YYTAGyNSPVK  0.249539  2.047050 -0.286033  0.042650  2.863317
+YYTSAsGDEMV  0.662787  0.135326 -1.004350  0.879398 -1.609894
+YYYSSsEDEDS       NaN -1.101679 -3.273987 -0.872370 -1.735891
+
+>>> p_signal.index  # p_signal.index contains the peptide sequences
+Index(['AAAAAsQQGSA', 'AAAAGsASPRS', 'AAAAGsGPSPP', 'AAAAGsGPsPP',
+       'AAAAPsPGSAR', 'AAAAPsPGsAR', 'AAAARsLLNHT', 'AAAARsPDRNL',
+       'AAAARtQAPPT', 'AAADFsDEDED',
+       ...
+       'YYDRMySYPAR', 'YYEDDsEGEDI', 'YYGGGsEGGRA', 'YYRNNsFTAPS',
+       'YYSPDyGLPSP', 'YYSPYsVSGSG', 'YYSSRsQSGGY', 'YYTAGyNSPVK',
+       'YYTSAsGDEMV', 'YYYSSsEDEDS'],
+      dtype='object', name='Sequence', length=30561)
+      
+>>> model = DDMC(n_components=2, seq_weight=100).fit(p_signal)  # fit model
+
+>>> model.transform(as_df=True)  # get cluster centers
+                 0         1
+Sample 1  0.017644  0.370375
+Sample 2 -0.003625 -0.914869
+Sample 3 -0.087624 -0.682140
+Sample 4  0.014644 -0.658907
+Sample 5  0.023885  0.196063
+```
diff --git a/WeightSearch.csv b/WeightSearch.csv
@@ -0,0 +1,3 @@
+,Weight,DDMC,Average,Zero,PCA
+0,0.0,,,,
+1,100.0,,,,
diff --git a/ddmc/binomial.py b/ddmc/binomial.py
@@ -5,76 +5,47 @@
 import pandas as pd
 import scipy.special as sc
 from Bio import motifs
-from Bio.Seq import Seq
+from collections import OrderedDict
 
 # Binomial method inspired by Schwartz & Gygi's Nature Biotech 2005: doi:10.1038/nbt1146
 
 # Amino acids frequencies (http://www.tiem.utk.edu/~gross/bioed/webmodules/aminoacid.htm) used for pseudocounts,
-AAfreq = {
-    "A": 0.074,
-    "R": 0.042,
-    "N": 0.044,
-    "D": 0.059,
-    "C": 0.033,
-    "Q": 0.058,
-    "E": 0.037,
-    "G": 0.074,
-    "H": 0.029,
-    "I": 0.038,
-    "L": 0.076,
-    "K": 0.072,
-    "M": 0.018,
-    "F": 0.04,
-    "P": 0.05,
-    "S": 0.081,
-    "T": 0.062,
-    "W": 0.013,
-    "Y": 0.033,
-    "V": 0.068,
-}
-AAlist = [
-    "A",
-    "C",
-    "D",
-    "E",
-    "F",
-    "G",
-    "H",
-    "I",
-    "K",
-    "L",
-    "M",
-    "N",
-    "P",
-    "Q",
-    "R",
-    "S",
-    "T",
-    "V",
-    "W",
-    "Y",
-]
+AAfreq: OrderedDict[str, float] = OrderedDict()
+AAfreq["A"] = 0.074
+AAfreq["R"] = 0.042
+AAfreq["N"] = 0.044
+AAfreq["D"] = 0.059
+AAfreq["C"] = 0.033
+AAfreq["Q"] = 0.058
+AAfreq["E"] = 0.037
+AAfreq["G"] = 0.074
+AAfreq["H"] = 0.029
+AAfreq["I"] = 0.038
+AAfreq["L"] = 0.076
+AAfreq["K"] = 0.072
+AAfreq["M"] = 0.018
+AAfreq["F"] = 0.040
+AAfreq["P"] = 0.050
+AAfreq["S"] = 0.081
+AAfreq["T"] = 0.062
+AAfreq["W"] = 0.013
+AAfreq["Y"] = 0.033
+AAfreq["V"] = 0.068
+
+AAlist = list(AAfreq.keys())
 
 
 def position_weight_matrix(seqs, pseudoC=AAfreq):
     """Build PWM of a given set of sequences."""
     return frequencies(seqs).normalize(pseudocounts=pseudoC)
 
 
-def frequencies(seqs):
+def frequencies(seqs: list[str]):
     """Build counts matrix of a given set of sequences."""
-    return motifs.create(seqs, alphabet=AAlist).counts
+    return motifs.create(seqs, alphabet="".join(AAlist)).counts
 
 
-def InformationContent(seqs):
-    """The mean of the PSSM is particularly important becuase its value is equal to the
-    Kullback-Leibler divergence or relative entropy, and is a measure for the information content
-    of the motif compared to the background."""
-    pssm = position_weight_matrix(seqs).log_odds(AAfreq)
-    return pssm.mean(AAfreq)
-
-
-def GenerateBinarySeqID(seqs):
+def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
     """Build matrix with 0s and 1s to identify residue/position pairs for every sequence"""
     res = np.zeros((len(seqs), len(AAlist), 11), dtype=bool)
     for ii, seq in enumerate(seqs):
@@ -83,15 +54,15 @@ def GenerateBinarySeqID(seqs):
     return res
 
 
-def BackgroundSeqs(forseqs):
+def BackgroundSeqs(forseqs: np.ndarray[str]) -> list[str]:
     """Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
     Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
     Source: https://www.phosphosite.org/staticDownloads.action -
     Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
     Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
     PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
     # Get porportion of psite types in foreground set
-    forw_pYn, forw_pSn, forw_pTn, _ = CountPsiteTypes(forseqs, 5)
+    forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs)
     forw_tot = forw_pYn + forw_pSn + forw_pTn
 
     pYf = forw_pYn / forw_tot
@@ -106,7 +77,7 @@ def BackgroundSeqs(forseqs):
     PsP = PsP[~PsP["SITE_+/-7_AA"].str.contains("X")]
     refseqs = list(PsP["SITE_+/-7_AA"])
     len_bg = int(len(refseqs))
-    backg_pYn, _, _, _ = CountPsiteTypes(refseqs, 7)
+    backg_pYn, _, _ = CountPsiteTypes(refseqs)
 
     # Make sure there are enough pY peptides to meet proportions
     if backg_pYn >= len_bg * pYf:
@@ -126,9 +97,12 @@ def BackgroundSeqs(forseqs):
     return bg_seqs
 
 
-def BackgProportions(refseqs, pYn, pSn, pTn):
+def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[str]:
     """Provided the proportions, add peptides to background set."""
-    y_seqs, s_seqs, t_seqs = [], [], []
+    y_seqs: list[str] = []
+    s_seqs: list[str] = []
+    t_seqs: list[str] = []
+
     pR = ["y", "t", "s"]
     for seq in refseqs:
         if seq[7] not in pR:
@@ -144,50 +118,56 @@ def BackgProportions(refseqs, pYn, pSn, pTn):
         ), "Wrong central AA in background set. Sliced: %s, Full: %s" % (motif, seq)
 
         if motif[5] == "Y" and len(y_seqs) < pYn:
-            y_seqs.append(Seq(motif))
+            y_seqs.append(motif)
 
         if motif[5] == "S" and len(s_seqs) < pSn:
-            s_seqs.append(Seq(motif))
+            s_seqs.append(motif)
 
         if motif[5] == "T" and len(t_seqs) < pTn:
-            t_seqs.append(Seq(motif))
+            t_seqs.append(motif)
 
     return y_seqs + s_seqs + t_seqs
 
 
 class Binomial:
-    """Create a binomial distance distribution."""
+    """Definition of the binomial sequence distance distribution."""
 
-    def __init__(self, seq, seqs):
+    def __init__(self, seqs: np.ndarray[str]):
         # Background sequences
-        background = position_weight_matrix(BackgroundSeqs(seq))
-        self.background = (
-            np.array([background[AA] for AA in AAlist]),
-            GenerateBinarySeqID(seqs),
-        )
+        background = position_weight_matrix(BackgroundSeqs(seqs))
+        self.background = np.array([background[AA] for AA in AAlist])
+        self.foreground: np.ndarray = GenerateBinarySeqID(seqs)
 
         self.logWeights = 0.0
-        assert np.all(np.isfinite(self.background[0]))
-        assert np.all(np.isfinite(self.background[1]))
+        assert np.all(np.isfinite(self.background))
+        assert np.all(np.isfinite(self.foreground))
 
-    def from_summaries(self, weightsIn):
+    def from_summaries(self, weightsIn: np.ndarray):
         """Update the underlying distribution."""
-        k = np.einsum("kji,kl->lji", self.background[1], weightsIn)
+        k = np.einsum("kji,kl->lji", self.foreground, weightsIn)
         betaA = np.sum(weightsIn, axis=0)[:, None, None] - k
         betaA = np.clip(betaA, 0.001, np.inf)
-        probmat = sc.betainc(betaA, k + 1, 1 - self.background[0])
-        tempp = np.einsum("ijk,ljk->il", self.background[1], probmat)
+        probmat = sc.betainc(betaA, k + 1, 1 - self.background)
+        tempp = np.einsum("ijk,ljk->il", self.foreground, probmat)
         self.logWeights = np.log(tempp)
 
 
-def CountPsiteTypes(X, cA):
-    """Count number of different phosphorylation types in a MS data set."""
-    positionSeq = [seq[cA] for seq in X]
-    pS = positionSeq.count("s")
-    pT = positionSeq.count("t")
-    pY = positionSeq.count("y")
+def CountPsiteTypes(X: np.ndarray[str]) -> tuple[int, int, int]:
+    """Count the number of different phosphorylation types in an MS data set.
+
+    Args:
+        X (list[str]): The list of peptide sequences.
+
+    Returns:
+        tuple[int, int, int]: The number of pY, pS, and pT sites.
+    """
+    X = np.char.upper(X)
 
-    countt = [sum(map(str.islower, seq)) for seq in X]
-    primed = sum(map(lambda i: i > 1, countt))
+    # Find the center amino acid
+    cA = int((len(X[0]) - 1) / 2)
 
-    return pY, pS, pT, primed
+    phospho_aminos = [seq[cA] for seq in X]
+    pS = phospho_aminos.count("S")
+    pT = phospho_aminos.count("T")
+    pY = phospho_aminos.count("Y")
+    return pY, pS, pT