delete many things

meyer-lab · Feb 2, 2024 · f6a0b34
1 parent 54d3779
commit f6a0b34
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 229 deletions.
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -16,8 +16,7 @@
 
 class DDMC(GaussianMixture):
     """Cluster peptides by both sequence similarity and condition-wise phosphorylation following an
-    expectation-maximization algorithm. SeqWeight specifies which method's expectation step
-    should have a larger effect on the peptide assignment."""
+    expectation-maximization algorithm."""
 
     def __init__(
         self,

diff --git a/ddmc/datasets.py b/ddmc/datasets.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from typing import Sequence
 
-from ddmc.motifs import DictProteomeNameToSeq
+from ddmc.motifs import get_proteome_name_to_seq
 
 DATA_DIR = Path(__file__).parent / "data"
 
@@ -136,7 +136,7 @@ def pos_to_motif(self, genes, pos):
         """Map p-site sequence position to uniprot's proteome and extract motifs."""
         proteome = open(DATA_DIR / "Sequence_analysis" / "proteome_uniprot2019.fa", "r")
         motif_size = 5
-        ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
+        ProteomeDict = get_proteome_name_to_seq(proteome, n="gene")
         motifs = []
         del_GeneToPos = []
         for gene, pos in list(zip(genes, pos)):

diff --git a/ddmc/logistic_regression.py b/ddmc/logistic_regression.py
@@ -11,6 +11,7 @@
 from sklearn.metrics import RocCurveDisplay
 from sklearn.model_selection import StratifiedKFold, RepeatedKFold
 
+
 def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False):
     """Plot LR coeficients of clusters."""
     coefs_ = pd.DataFrame(lr.coef_.T, columns=["LR Coefficient"])
@@ -20,8 +21,6 @@ def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False):
         hue = "Sample"
     else:
         coefs_["Cluster"] = np.arange(coefs_.shape[0])
-    if xlabels is not None:
-        coefs_["Cluster"] = xlabels
     p = sns.barplot(
         ax=ax,
         x="Cluster",
@@ -38,36 +37,6 @@ def plot_cluster_regression_coefficients(ax: Axes, lr, hue=None, title=False):
         ax.set_title(title)
 
 
-def plotPredictionProbabilities(ax, lr, dd, yy):
-    """Plot LR predictions and prediction probabilities."""
-    res_ = pd.DataFrame()
-    res_["y, p(x)"] = lr.predict_proba(dd)[:, 1]
-    z = lr.predict(dd) == yy
-    res_["Correct_Prediction"] = z.values
-    res_["Prediction"] = lr.predict(dd).astype("int")
-    res_["Patients"] = np.arange(res_.shape[0]) + 1
-    sns.scatterplot(
-        ax=ax, x="Patients", y="Prediction", data=res_, hue="Correct_Prediction"
-    )
-    sns.lineplot(ax=ax, x="Patients", y="y, p(x)", data=res_, marker="s", color="gray")
-    ax.axhline(0.5, ls="--", color="r")
-
-
-def plotConfusionMatrix(ax, lr, dd, yy):
-    """Actual vs predicted outputs"""
-    cm = confusion_matrix(yy, lr.predict(dd))
-    n = lr.classes_.shape[0]
-    ax.imshow(cm)
-    ax.grid(False)
-    ax.set_xlabel("Predicted outputs", color="black")
-    ax.set_ylabel("Actual outputs", color="black")
-    ax.xaxis.set(ticks=range(n))
-    ax.yaxis.set(ticks=range(n))
-    for i in range(n):
-        for j in range(n):
-            ax.text(j, i, cm[i, j], ha="center", va="center", color="white")
-
-
 def plot_roc(
     classifier,
     X: np.ndarray,

diff --git a/ddmc/motifs.py b/ddmc/motifs.py
@@ -9,29 +9,7 @@
 from .binomial import AAlist
 
 
-def MapMotifs(X, names):
-    """Generate pY motifs for pre-processing."""
-    names, seqs, pXpos, Xidx = GeneratingKinaseMotifs(names, FormatSeq(X))
-    X = X.iloc[Xidx, :]
-    X["Gene"] = names
-    X["Sequence"] = seqs
-    X.insert(3, "Position", pXpos)
-    return X[~X["Sequence"].str.contains("-")]
-
-
-def FormatName(X):
-    """Keep only the general protein name, without any other accession information"""
-    full = [v.split("OS")[0].strip() for v in X.iloc[:, 0]]
-    gene = [v.split("GN=")[1].split(" PE")[0].strip() for v in X.iloc[:, 0]]
-    return full, gene
-
-
-def FormatSeq(X):
-    """Deleting -1/-2 for mapping to uniprot's proteome"""
-    return [v.split("-")[0] for v in X["Sequence"]]
-
-
-def DictProteomeNameToSeq(X, n):
+def get_proteome_name_to_seq(X, n):
     """To generate proteom's dictionary"""
     DictProtToSeq_UP = {}
     for rec2 in SeqIO.parse(X, "fasta"):
@@ -48,17 +26,17 @@ def DictProteomeNameToSeq(X, n):
     return DictProtToSeq_UP
 
 
-def getKeysByValue(dictOfElements, valueToFind):
+def get_keys_by_value(dictionary, value):
     """Find the key of a given value within a dictionary."""
     listOfKeys = list()
-    listOfItems = dictOfElements.items()
+    listOfItems = dictionary.items()
     for item in listOfItems:
-        if valueToFind in item[1]:
+        if value in item[1]:
             listOfKeys.append(item[0])
     return listOfKeys
 
 
-def MatchProtNames(ProteomeDict, MS_names, MS_seqs):
+def match_protein_names(ProteomeDict, MS_names, MS_seqs):
     """Match protein names of MS and Uniprot's proteome."""
     matchedNames, seqs, Xidx = [], [], []
     counter = 0
@@ -71,7 +49,7 @@ def MatchProtNames(ProteomeDict, MS_names, MS_seqs):
             matchedNames.append(MS_name)
         else:
             try:
-                newname = getKeysByValue(ProteomeDict, MS_seqU)[0]
+                newname = get_keys_by_value(ProteomeDict, MS_seqU)[0]
                 assert MS_seqU in ProteomeDict[newname]
                 Xidx.append(i)
                 seqs.append(MS_seq)
@@ -86,7 +64,7 @@ def MatchProtNames(ProteomeDict, MS_names, MS_seqs):
     return matchedNames, seqs, Xidx
 
 
-def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
+def find_motif(MS_seq, MS_name, ProteomeDict, motif_size):
     """For a given MS peptide, finds it in the ProteomeDict, and maps the +/-5 AA from the p-site, accounting
     for peptides phosphorylated multiple times concurrently."""
     MS_seqU = MS_seq.upper()
@@ -114,7 +92,7 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
             elif "t" in MS_seq or "s" in MS_seq:
                 DoS_idx = list(re.compile("y|t|s").finditer(MS_seq))
                 assert len(DoS_idx) != 0
-            mappedMotif, pidx = makeMotif(
+            mappedMotif, pidx = make_motif(
                 UP_seq, MS_seq, motif_size, y_idx, center_idx, DoS_idx
             )
             if len(pidx) == 1:
@@ -130,7 +108,7 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
             DoS_idx = None
             if len(pTS_idx) > 1:
                 DoS_idx = pTS_idx[1:]
-            mappedMotif, pidx = makeMotif(
+            mappedMotif, pidx = make_motif(
                 UP_seq, MS_seq, motif_size, ts_idx, center_idx, DoS_idx
             )
             if len(pidx) == 1:
@@ -145,12 +123,12 @@ def findmotif(MS_seq, MS_name, ProteomeDict, motif_size):
     return pos, mappedMotif
 
 
-def GeneratingKinaseMotifs(names, seqs):
+def generate_kinase_motifs(names, seqs):
     """Main function to generate motifs using 'findmotif'."""
     motif_size = 5
     proteome = open("./data/Sequence_analysis/proteome_uniprot2019.fa", "r")
-    ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
-    protnames, seqs, Xidx = MatchProtNames(ProteomeDict, names, seqs)
+    ProteomeDict = get_proteome_name_to_seq(proteome, n="gene")
+    protnames, seqs, Xidx = match_protein_names(ProteomeDict, names, seqs)
     (
         MS_names,
         mapped_motifs,
@@ -162,7 +140,7 @@ def GeneratingKinaseMotifs(names, seqs):
     )
 
     for i, MS_seq in enumerate(seqs):
-        pos, mappedMotif = findmotif(MS_seq, protnames[i], ProteomeDict, motif_size)
+        pos, mappedMotif = find_motif(MS_seq, protnames[i], ProteomeDict, motif_size)
         MS_names.append(protnames[i])
         mapped_motifs.append(mappedMotif)
         uni_pos.append(pos)
@@ -171,7 +149,7 @@ def GeneratingKinaseMotifs(names, seqs):
     return MS_names, mapped_motifs, uni_pos, Xidx
 
 
-def makeMotif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_idx):
+def make_motif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_idx):
     """Make a motif out of the matched sequences."""
     UP_seq_copy = list(
         UP_seq[max(0, ps_protein_idx - motif_size) : ps_protein_idx + motif_size + 1]
@@ -212,29 +190,6 @@ def makeMotif(UP_seq, MS_seq, motif_size, ps_protein_idx, center_motif_idx, DoS_
     return "".join(UP_seq_copy), pidx
 
 
-def preprocess_seqs(X, pYTS):
-    """Filter out any sequences with different than the specified central p-residue
-    and/or any containing gaps."""
-    X = X[~X["Sequence"].str.contains("-")]
-
-    Xidx = []
-    for seq in X["Sequence"]:
-        Xidx.append(seq[5] == pYTS.lower())
-    return X.iloc[Xidx, :]
-
-
-def ForegroundSeqs(sequences):
-    """Build Background data set for either "Y", "S", or "T"."""
-    seqs = []
-    yts = ["Y", "T", "S"]
-    for motif in sequences:
-        motif = motif.upper()
-        assert "-" not in motif, "gap in motif"
-        assert motif[5] in yts, "WRONG CENTRAL AMINO ACID"
-        seqs.append(Seq(motif, alphabet=AAlist))
-    return seqs
-
-
 def get_pspls() -> tuple[np.ndarray, np.ndarray]:
     """Generate dictionary with kinase name-specificity profile pairs"""
     pspls_arr = []

diff --git a/ddmc/pam250.py b/ddmc/pam250.py
@@ -37,9 +37,7 @@ def MotifPam250Scores(seqs: list[str]) -> np.ndarray:
 def distanceCalc(seqs, pam250m: np.ndarray):
     """Calculate all the pairwise distances."""
     # WARNING this type can only hold -128 to 127
-
     out = np.zeros((seqs.shape[0], seqs.shape[0]), dtype=np.int8)
     i_idx, j_idx = np.tril_indices(seqs.shape[0])
     out[i_idx, j_idx] = np.sum(pam250m[seqs[i_idx], seqs[j_idx]], axis=1)
-
     return out
diff --git a/ddmc/pre_processing.py b/ddmc/pre_processing.py