Initial pass

meyer-lab · Jan 20, 2024 · 7fb1c7a · 7fb1c7a
1 parent d3fcbd6
commit 7fb1c7a
Show file tree

Hide file tree

Showing 21 changed files with 155 additions and 739 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,7 +10,7 @@ jobs:
     - name: Install dependencies
       run: poetry install
     - name: Build figures
-      run: make -j 3 all
+      run: make -i all
     - name: Upload files
       uses: actions/upload-artifact@v4
       with:

diff --git a/ddmc/binomial.py b/ddmc/binomial.py
@@ -5,76 +5,47 @@
 import pandas as pd
 import scipy.special as sc
 from Bio import motifs
-from Bio.Seq import Seq
+from collections import OrderedDict
 
 # Binomial method inspired by Schwartz & Gygi's Nature Biotech 2005: doi:10.1038/nbt1146
 
 # Amino acids frequencies (http://www.tiem.utk.edu/~gross/bioed/webmodules/aminoacid.htm) used for pseudocounts,
-AAfreq = {
-    "A": 0.074,
-    "R": 0.042,
-    "N": 0.044,
-    "D": 0.059,
-    "C": 0.033,
-    "Q": 0.058,
-    "E": 0.037,
-    "G": 0.074,
-    "H": 0.029,
-    "I": 0.038,
-    "L": 0.076,
-    "K": 0.072,
-    "M": 0.018,
-    "F": 0.04,
-    "P": 0.05,
-    "S": 0.081,
-    "T": 0.062,
-    "W": 0.013,
-    "Y": 0.033,
-    "V": 0.068,
-}
-AAlist = [
-    "A",
-    "C",
-    "D",
-    "E",
-    "F",
-    "G",
-    "H",
-    "I",
-    "K",
-    "L",
-    "M",
-    "N",
-    "P",
-    "Q",
-    "R",
-    "S",
-    "T",
-    "V",
-    "W",
-    "Y",
-]
+AAfreq: OrderedDict[str, float] = OrderedDict()
+AAfreq["A"] = 0.074
+AAfreq["R"] = 0.042
+AAfreq["N"] = 0.044
+AAfreq["D"] = 0.059
+AAfreq["C"] = 0.033
+AAfreq["Q"] = 0.058
+AAfreq["E"] = 0.037
+AAfreq["G"] = 0.074
+AAfreq["H"] = 0.029
+AAfreq["I"] = 0.038
+AAfreq["L"] = 0.076
+AAfreq["K"] = 0.072
+AAfreq["M"] = 0.018
+AAfreq["F"] = 0.040
+AAfreq["P"] = 0.050
+AAfreq["S"] = 0.081
+AAfreq["T"] = 0.062
+AAfreq["W"] = 0.013
+AAfreq["Y"] = 0.033
+AAfreq["V"] = 0.068
+
+AAlist = list(AAfreq.keys())
 
 
 def position_weight_matrix(seqs, pseudoC=AAfreq):
     """Build PWM of a given set of sequences."""
     return frequencies(seqs).normalize(pseudocounts=pseudoC)
 
 
-def frequencies(seqs):
+def frequencies(seqs: list[str]):
     """Build counts matrix of a given set of sequences."""
-    return motifs.create(seqs, alphabet=AAlist).counts
+    return motifs.create(seqs, alphabet="".join(AAlist)).counts
 
 
-def InformationContent(seqs):
-    """The mean of the PSSM is particularly important becuase its value is equal to the
-    Kullback-Leibler divergence or relative entropy, and is a measure for the information content
-    of the motif compared to the background."""
-    pssm = position_weight_matrix(seqs).log_odds(AAfreq)
-    return pssm.mean(AAfreq)
-
-
-def GenerateBinarySeqID(seqs):
+def GenerateBinarySeqID(seqs: list[str]) -> np.ndarray:
     """Build matrix with 0s and 1s to identify residue/position pairs for every sequence"""
     res = np.zeros((len(seqs), len(AAlist), 11), dtype=bool)
     for ii, seq in enumerate(seqs):
@@ -83,15 +54,15 @@ def GenerateBinarySeqID(seqs):
     return res
 
 
-def BackgroundSeqs(forseqs):
+def BackgroundSeqs(forseqs: pd.Series) -> list[str]:
     """Build Background data set with the same proportion of pY, pT, and pS motifs as in the foreground set of sequences.
     Note this PsP data set contains 51976 pY, 226131 pS, 81321 pT
     Source: https://www.phosphosite.org/staticDownloads.action -
     Phosphorylation_site_dataset.gz - Last mod: Wed Dec 04 14:56:35 EST 2019
     Cite: Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,
     PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926"""
     # Get porportion of psite types in foreground set
-    forw_pYn, forw_pSn, forw_pTn, _ = CountPsiteTypes(forseqs, 5)
+    forw_pYn, forw_pSn, forw_pTn = CountPsiteTypes(forseqs.astype(str).tolist())
     forw_tot = forw_pYn + forw_pSn + forw_pTn
 
     pYf = forw_pYn / forw_tot
@@ -106,7 +77,7 @@ def BackgroundSeqs(forseqs):
     PsP = PsP[~PsP["SITE_+/-7_AA"].str.contains("X")]
     refseqs = list(PsP["SITE_+/-7_AA"])
     len_bg = int(len(refseqs))
-    backg_pYn, _, _, _ = CountPsiteTypes(refseqs, 7)
+    backg_pYn, _, _ = CountPsiteTypes(refseqs)
 
     # Make sure there are enough pY peptides to meet proportions
     if backg_pYn >= len_bg * pYf:
@@ -126,9 +97,12 @@ def BackgroundSeqs(forseqs):
     return bg_seqs
 
 
-def BackgProportions(refseqs, pYn, pSn, pTn):
+def BackgProportions(refseqs: list[str], pYn: int, pSn: int, pTn: int) -> list[str]:
     """Provided the proportions, add peptides to background set."""
-    y_seqs, s_seqs, t_seqs = [], [], []
+    y_seqs: list[str] = []
+    s_seqs: list[str] = []
+    t_seqs: list[str] = []
+
     pR = ["y", "t", "s"]
     for seq in refseqs:
         if seq[7] not in pR:
@@ -144,50 +118,54 @@ def BackgProportions(refseqs, pYn, pSn, pTn):
         ), "Wrong central AA in background set. Sliced: %s, Full: %s" % (motif, seq)
 
         if motif[5] == "Y" and len(y_seqs) < pYn:
-            y_seqs.append(Seq(motif))
+            y_seqs.append(motif)
 
         if motif[5] == "S" and len(s_seqs) < pSn:
-            s_seqs.append(Seq(motif))
+            s_seqs.append(motif)
 
         if motif[5] == "T" and len(t_seqs) < pTn:
-            t_seqs.append(Seq(motif))
+            t_seqs.append(motif)
 
     return y_seqs + s_seqs + t_seqs
 
 
 class Binomial:
-    """Create a binomial distance distribution."""
+    """Definition of the binomial sequence distance distribution."""
 
-    def __init__(self, seq, seqs):
+    def __init__(self, seq: pd.Series, seqs: list[str]):
         # Background sequences
         background = position_weight_matrix(BackgroundSeqs(seq))
-        self.background = (
-            np.array([background[AA] for AA in AAlist]),
-            GenerateBinarySeqID(seqs),
-        )
+        self.background = np.array([background[AA] for AA in AAlist])
+        self.foreground: np.ndarray = GenerateBinarySeqID(seqs)
 
         self.logWeights = 0.0
-        assert np.all(np.isfinite(self.background[0]))
-        assert np.all(np.isfinite(self.background[1]))
+        assert np.all(np.isfinite(self.background))
+        assert np.all(np.isfinite(self.foreground))
 
-    def from_summaries(self, weightsIn):
+    def from_summaries(self, weightsIn: np.ndarray):
         """Update the underlying distribution."""
-        k = np.einsum("kji,kl->lji", self.background[1], weightsIn)
+        k = np.einsum("kji,kl->lji", self.foreground, weightsIn)
         betaA = np.sum(weightsIn, axis=0)[:, None, None] - k
         betaA = np.clip(betaA, 0.001, np.inf)
-        probmat = sc.betainc(betaA, k + 1, 1 - self.background[0])
-        tempp = np.einsum("ijk,ljk->il", self.background[1], probmat)
+        probmat = sc.betainc(betaA, k + 1, 1 - self.background)
+        tempp = np.einsum("ijk,ljk->il", self.foreground, probmat)
         self.logWeights = np.log(tempp)
 
 
-def CountPsiteTypes(X, cA):
-    """Count number of different phosphorylation types in a MS data set."""
+def CountPsiteTypes(X: list[str]) -> tuple[int, int, int]:
+    """Count the number of different phosphorylation types in an MS data set.
+
+    Args:
+        X (list[str]): The list of peptide sequences.
+
+    Returns:
+        tuple[int, int, int]: The number of pY, pS, and pT sites.
+    """
+    # Find the center amino acid
+    cA = int((len(X[0]) - 1) / 2)
+
     positionSeq = [seq[cA] for seq in X]
     pS = positionSeq.count("s")
     pT = positionSeq.count("t")
     pY = positionSeq.count("y")
-
-    countt = [sum(map(str.islower, seq)) for seq in X]
-    primed = sum(map(lambda i: i > 1, countt))
-
-    return pY, pS, pT, primed
+    return pY, pS, pT
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -14,17 +14,14 @@
 from fancyimpute import SoftImpute
 
 
-# pylint: disable=W0201
-
-
 class DDMC(GaussianMixture):
     """Cluster peptides by both sequence similarity and data behavior following an
     expectation-maximization algorithm. SeqWeight specifies which method's expectation step
     should have a larger effect on the peptide assignment."""
 
     def __init__(
         self,
-        info: pd.DataFrame,
+        sequences: pd.Series,
         n_components: int,
         SeqWeight: float,
         distance_method: Literal["PAM250", "Binomial"],
@@ -39,20 +36,20 @@ def __init__(
             random_state=random_state,
         )
 
-        self.info = info
+        self.sequences = sequences
         self.SeqWeight = SeqWeight
         self.distance_method = distance_method
 
-        seqs = [s.upper() for s in info["Sequence"]]
+        seqs = sequences.str.upper().to_list()
 
         if distance_method == "PAM250":
             self.seqDist: PAM250 | Binomial = PAM250(seqs)
         elif distance_method == "Binomial":
-            self.seqDist = Binomial(info["Sequence"], seqs)
+            self.seqDist = Binomial(sequences, seqs)
         else:
             raise ValueError("Wrong distance type.")
 
-    def _estimate_log_prob(self, X):
+    def _estimate_log_prob(self, X: np.ndarray):
         """Estimate the log-probability of each point in each cluster."""
         logp = super()._estimate_log_prob(X)  # Do the regular work
 
@@ -62,7 +59,7 @@ def _estimate_log_prob(self, X):
 
         return logp
 
-    def _m_step(self, X, log_resp):
+    def _m_step(self, X: np.ndarray, log_resp: np.ndarray):
         """M step.
         Parameters
         ----------
@@ -86,7 +83,7 @@ def _m_step(self, X, log_resp):
         # Do sequence m step
         self.seqDist.from_summaries(np.exp(log_resp))
 
-    def fit(self, X, y=None):
+    def fit(self, X: np.ndarray | pd.DataFrame, y=None):
         """Compute EM clustering"""
         d = np.array(X.T)
 
@@ -132,12 +129,12 @@ def wins(self, X):
 
         return (dataDist, seqDist)
 
-    def transform(self):
+    def transform(self) -> np.ndarray:
         """Calculate cluster averages."""
         check_is_fitted(self, ["means_"])
         return self.means_.T
 
-    def impute(self, X):
+    def impute(self, X: np.ndarray) -> np.ndarray:
         """Impute a matching dataset."""
         X = X.copy()
         labels = self.labels()  # cluster assignments
@@ -157,7 +154,7 @@ def pssms(self, PsP_background=False):
         """
         pssms, cl_num = [], []
         if PsP_background:
-            bg_seqs = BackgroundSeqs(self.info["Sequence"])
+            bg_seqs = BackgroundSeqs(self.sequences)
             back_pssm = compute_control_pssm(bg_seqs)
         else:
             back_pssm = np.zeros((len(AAlist), 11), dtype=float)
@@ -171,7 +168,7 @@ def pssms(self, PsP_background=False):
 
             # Compute PSSM
             pssm = np.zeros((len(AAlist), 11), dtype=float)
-            for jj, seq in enumerate(self.info["Sequence"]):
+            for jj, seq in enumerate(self.sequences):
                 seq = seq.upper()
                 for kk, aa in enumerate(seq):
                     pssm[AAlist.index(aa), kk] += self.scores_[jj, ii - 1]
@@ -187,8 +184,9 @@ def pssms(self, PsP_background=False):
                     back_pssm[:, pos] /= np.mean(back_pssm[:, pos])
 
             # Normalize to background PSSM to account for AA frequencies per position
-            with np.seterr(divide='ignore', invalid='ignore'):
-                pssm /= back_pssm.copy()
+            old_settings = np.seterr(divide="ignore", invalid="ignore")
+            pssm /= back_pssm.copy()
+            np.seterr(**old_settings)
 
             # Log2 transform
             pssm = np.ma.log2(pssm)
@@ -198,7 +196,7 @@ def pssms(self, PsP_background=False):
             pssm.index = AAlist
 
             # Normalize phosphoacceptor position to frequency
-            df = pd.DataFrame(self.info["Sequence"].str.upper())
+            df = pd.DataFrame(self.sequences.str.upper())
             df["Cluster"] = self.labels()
             clSeq = df[df["Cluster"] == ii]["Sequence"]
             clSeq = pd.DataFrame(frequencies(clSeq)).T
@@ -236,24 +234,24 @@ def predict_UpstreamKinases(
         table.insert(0, "Kinase", list(PSPLdict().keys()))
         return table
 
-    def predict(self):
+    def predict(self) -> np.ndarray:
         """Provided the current model parameters, predict the cluster each peptide belongs to."""
         check_is_fitted(self, ["scores_"])
         return np.argmax(self.scores_, axis=1)
 
-    def labels(self):
+    def labels(self) -> np.ndarray:
         """Find cluster assignment with highest likelihood for each peptide."""
         return self.predict() + 1
 
-    def score(self):
+    def score(self) -> float:
         """Generate score of the fitting."""
         check_is_fitted(self, ["lower_bound_"])
         return self.lower_bound_
 
-    def get_params(self, deep=True):
+    def get_params(self, deep: bool = True):
         """Returns a dict of the estimator parameters with their values."""
         dictt = super().get_params(deep=deep)
-        dictt["info"] = self.info
+        dictt["sequences"] = self.sequences
         dictt["SeqWeight"] = self.SeqWeight
         dictt["distance_method"] = self.distance_method
         return dictt