From 88f6e6838d84eacea515b5f6cdea91e591877c50 Mon Sep 17 00:00:00 2001
From: Armaan Abraham <armaanabraham@g.ucla.edu>
Date: Fri, 2 Feb 2024 13:26:47 -0800
Subject: [PATCH] move ebdt to datasets

---
 ddmc/datasets.py         |  70 +++++++++++++++++++--
 ddmc/figures/figureM2.py |  71 +++++++++------------
 ddmc/figures/figureM3.py | 132 +++++++++++----------------------------
 3 files changed, 132 insertions(+), 141 deletions(-)

diff --git a/ddmc/datasets.py b/ddmc/datasets.py
index 1f12e12d..f14e66be 100644
--- a/ddmc/datasets.py
+++ b/ddmc/datasets.py
@@ -1,8 +1,11 @@
+import re
+from pathlib import Path
+
 import numpy as np
 import pandas as pd
 from typing import Sequence
 
-from pathlib import Path
+from ddmc.motifs import DictProteomeNameToSeq
 
 DATA_DIR = Path(__file__).parent / "data"
 
@@ -45,7 +48,7 @@ def select_peptide_subset(
 class CPTAC:
     data_dir = DATA_DIR / "MS" / "CPTAC"
 
-    def load_sample_to_experiment(self, as_df=False):
+    def get_sample_to_experiment(self, as_df=False):
         sample_to_experiment = pd.read_csv(self.data_dir / "IDtoExperiment.csv")
         if as_df:
             return sample_to_experiment
@@ -60,7 +63,7 @@ def get_p_signal(self) -> pd.DataFrame:
         return filter_incomplete_peptides(
             p_signal,
             min_experiments=2,
-            sample_to_experiment=self.load_sample_to_experiment(),
+            sample_to_experiment=self.get_sample_to_experiment(),
         )
 
     def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
@@ -72,7 +75,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
         nat_patients = np.char.replace(nat_samples, ".N", "")
         return np.intersect1d(tumor_patients, nat_patients)
 
-    def get_mutations(self, mutation_names: Sequence[str]=None):
+    def get_mutations(self, mutation_names: Sequence[str] = None):
         mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv")
         mutations = mutations.set_index("Sample.ID")
         patients = self.get_patients_with_nat_and_tumor(mutations.index.values)
@@ -94,6 +97,63 @@ def get_hot_cold_labels(self):
         hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
         hot_cold = hot_cold.dropna()
         return np.squeeze(hot_cold).astype(bool)
-    
+
     def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
         return ~np.array([sample.endswith(".N") for sample in samples])
+
+
+# MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)
+class EBDT:
+    def get_p_signal(self) -> pd.DataFrame:
+        """Preprocess"""
+        p_signal = (
+            pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv")
+            .drop("FDR", axis=1)
+            .set_index("sh.index.sites")
+            .drop("ARPC2_HUMAN;")
+            .reset_index()
+        )
+        p_signal.insert(
+            0, "Gene", [s.split("(")[0] for s in p_signal["sh.index.sites"]]
+        )
+        p_signal.insert(
+            1,
+            "Position",
+            [
+                re.search(r"\(([A-Za-z0-9]+)\)", s).group(1)
+                for s in p_signal["sh.index.sites"]
+            ],
+        )
+        p_signal = p_signal.drop("sh.index.sites", axis=1)
+        motifs, del_ids = self.pos_to_motif(p_signal["Gene"], p_signal["Position"])
+        p_signal = p_signal.set_index(["Gene", "Position"]).drop(del_ids).reset_index()
+        p_signal.insert(0, "Sequence", motifs)
+        p_signal = p_signal.drop(columns=["Gene", "Position"])
+        p_signal = p_signal.set_index("Sequence")
+        return p_signal
+
+    def pos_to_motif(self, genes, pos):
+        """Map p-site sequence position to uniprot's proteome and extract motifs."""
+        proteome = open(DATA_DIR / "Sequence_analysis" / "proteome_uniprot2019.fa", "r")
+        motif_size = 5
+        ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
+        motifs = []
+        del_GeneToPos = []
+        for gene, pos in list(zip(genes, pos)):
+            try:
+                UP_seq = ProteomeDict[gene]
+            except BaseException:
+                del_GeneToPos.append([gene, pos])
+                continue
+            idx = int(pos[1:]) - 1
+            motif = list(UP_seq[max(0, idx - motif_size) : idx + motif_size + 1])
+            if (
+                len(motif) != motif_size * 2 + 1
+                or pos[0] != motif[motif_size]
+                or pos[0] not in ["S", "T", "Y"]
+            ):
+                del_GeneToPos.append([gene, pos])
+                continue
+            motif[motif_size] = motif[motif_size].lower()
+            motifs.append("".join(motif))
+        return motifs, del_GeneToPos
diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py
index ee8a7902..a3068f80 100644
--- a/ddmc/figures/figureM2.py
+++ b/ddmc/figures/figureM2.py
@@ -1,6 +1,7 @@
 """
 This creates Figure 2: Evaluation of Imputating Missingness
 """
+
 import numpy as np
 from scipy.stats import gmean
 import pandas as pd
@@ -9,6 +10,7 @@
 from ..clustering import DDMC
 from ..pre_processing import filter_NaNpeptides
 from fancyimpute import IterativeSVD
+from ddmc.datasets import CPTAC
 
 
 def makeFigure():
@@ -18,8 +20,8 @@ def makeFigure():
 
     # diagram explaining reconstruction process
     ax[0].axis("off")
-    
-    n_clusters = np.arange(1, 46, 45) 
+
+    n_clusters = np.arange(1, 46, 45)
 
     # Imputation error across Cluster numbers
     dataC_W0 = run_repeated_imputation(
@@ -47,7 +49,7 @@ def makeFigure():
     )
     plot_imputation_errs(ax[4], dataW_2C, "Weight", legend=False)
     ax[4].set_ylim(10.5, 12)
-     
+
     dataW_20C = run_repeated_imputation(
         "Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=1
     )
@@ -76,8 +78,6 @@ def plot_imputation_errs(ax, data, kind, legend=True):
     gm["Zero"] = np.log(data.groupby([kind]).Zero.apply(gmean).values)
     gm["PCA"] = np.log(data.groupby([kind]).PCA.apply(gmean).values)
 
-    gm.to_csv("WeightSearch.csv")
-
     sns.regplot(
         x=kind,
         y="DDMC",
@@ -110,29 +110,12 @@ def plot_imputation_errs(ax, data, kind, legend=True):
         ax.legend().remove()
 
 
-def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=6):
+def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
     """Calculate missingness error across different numbers of clusters and/or weights."""
     assert len(weights) == len(n_clusters)
-    X_raw = filter_NaNpeptides(
-        pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:],
-        tmt=tmt,
-    )
-    # reset index
-    X_raw.reset_index(drop=True, inplace=True)
-
-    info_cols = ["Sequence", "Protein", "Gene", "Position"]
-    sample_cols = [col for col in X_raw.columns if col not in info_cols]
-    sequences = X_raw["Sequence"].copy()
-    X = X_raw[sample_cols].copy()
-
-    # the condition in which each sample was collected
-    sample_to_condition_df = pd.read_csv("ddmc/data/MS/CPTAC/IDtoExperiment.csv")
-    assert all(
-        sample_to_condition_df.iloc[:, 0] == X.columns
-    ), "Sample labels don't match."
-    X = X.to_numpy()
-    sample_to_condition = sample_to_condition_df["Experiment (TMT10plex)"].to_numpy()
-    assert X.shape[1] == sample_to_condition.size
+    cptac = CPTAC()
+    p_signal = cptac.get_p_signal()
+    sample_to_experiment = cptac.get_sample_to_experiment()
 
     df = pd.DataFrame(
         columns=[
@@ -148,15 +131,15 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=
     )
 
     for ii in range(n_runs):
-        X_miss = add_missingness(X, sample_to_condition)
+        X_miss = add_missingness(p_signal, sample_to_experiment)
         baseline_imputations = [
             impute_mean(X_miss),
             impute_zero(X_miss),
             impute_min(X_miss),
-            impute_pca(X, 5),
+            impute_pca(X_miss, 5),
         ]
         baseline_errs = [
-            imputation_error(X, X_impute) for X_impute in baseline_imputations
+            imputation_error(p_signal, X_impute) for X_impute in baseline_imputations
         ]
 
         for jj, cluster in enumerate(n_clusters):
@@ -165,20 +148,23 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=
                 cluster,
                 weights[jj],
                 imputation_error(
-                    X, impute_ddmc(X, sequences, cluster, weights[jj], distance_method)
+                    p_signal,
+                    impute_ddmc(p_signal, cluster, weights[jj], distance_method),
                 ),
                 *baseline_errs,
             ]
     return df
 
 
-def add_missingness(X, sample_to_experiment):
+def add_missingness(p_signal, sample_to_experiment):
     """Remove a random TMT experiment for each peptide."""
-    X = X.copy()
-    for ii in range(X.shape[0]):
-        tmtNum = sample_to_experiment[np.isfinite(X[ii, :])]
-        X[ii, sample_to_experiment == np.random.choice(np.unique(tmtNum))] = np.nan
-    return X
+    p_signal = p_signal.copy()
+    for ii in range(p_signal.shape[0]):
+        experiments = sample_to_experiment[np.isfinite(p_signal[ii, :])]
+        p_signal[
+            ii, sample_to_experiment == np.random.choice(np.unique(experiments))
+        ] = np.nan
+    return p_signal
 
 
 def imputation_error(X, X_impute):
@@ -195,7 +181,7 @@ def impute_zero(X):
 
 
 def impute_min(X):
-    X = X.copy() 
+    X = X.copy()
     np.copyto(X, np.nanmin(X, axis=0, keepdims=True), where=np.isnan(X))
     return X
 
@@ -211,8 +197,9 @@ def impute_pca(X, rank):
     return IterativeSVD(rank=rank, verbose=False).fit_transform(X)
 
 
-def impute_ddmc(X, sequences, n_clusters, weight, distance_method):
-    return DDMC(sequences, n_clusters, weight, distance_method, max_iter=1, tol=0.1).fit(X).impute(X)
-
-
-makeFigure()
+def impute_ddmc(p_signal, n_clusters, weight, distance_method):
+    return (
+        DDMC(n_clusters, weight, distance_method, max_iter=1)
+        .fit(p_signal)
+        .impute(p_signal)
+    )
diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py
index c0d1b592..47e8a8f5 100644
--- a/ddmc/figures/figureM3.py
+++ b/ddmc/figures/figureM3.py
@@ -2,22 +2,21 @@
 This creates Figure 2: Validations
 """
 
-import re
 import numpy as np
 import pandas as pd
 import seaborn as sns
-from ..clustering import DDMC
-from ..binomial import AAlist
-from .common import getSetup
-from .common import (
+
+from ddmc.clustering import DDMC,compute_control_pssm, get_pspl_pssm_distances 
+from ddmc.binomial import AAlist
+from ddmc.figures.common import 
+from ddmc.figures.common import (
+    getSetup,
     plot_motifs,
     plot_cluster_kinase_distances,
     plot_pca_on_cluster_centers,
 )
-from ..clustering import compute_control_pssm, get_pspl_pssm_distances
-from ..binomial import AAlist
-from ..motifs import DictProteomeNameToSeq, get_pspls
-from ..pre_processing import filter_NaNpeptides, separate_sequence_and_abundance
+from ddmc.datasets import CPTAC, EBDT
+from ddmc.motifs import get_pspls
 
 
 def makeFigure():
@@ -38,27 +37,24 @@ def makeFigure():
 
     return f
 
-
 def plot_fig_3abd(ax_a, ax_b, ax_d):
     # Import signaling data
-    x = preprocess_ebdt_mcf7()
-    seqs, abund = separate_sequence_and_abundance(x)
+    p_signal = EBDT().get_p_signal()
 
     # Fit DDMC
-    ddmc = DDMC(
-        seqs,
+    model = DDMC(
         n_components=20,
         seq_weight=5,
         distance_method="Binomial",
         random_state=10,
         max_iter=1,
-    ).fit(abund)
+    ).fit(p_signal)
 
     # get cluster centers
-    centers = ddmc.transform(as_df=True)
+    centers = model.transform(as_df=True)
 
     # parse inhibitor names from sample names
-    inhibitors = [s.split(".")[1].split(".")[0] for s in abund.columns]
+    inhibitors = [s.split(".")[1].split(".")[0] for s in p_signal.columns]
 
     # create new bool array specifying whether each inhibitor is an AKT inhibitor
     is_AKTi = [
@@ -88,8 +84,8 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
 
     # Plot kinase predictions for cluster 16
     plot_cluster_kinase_distances(
-        ddmc.predict_upstream_kinases()[[16]],
-        ddmc.get_pssms(PsP_background=True, clusters=[16])[0],
+        model.predict_upstream_kinases()[[16]],
+        model.get_pssms(PsP_background=True, clusters=[16])[0],
         ax=ax_d,
     )
 
@@ -133,28 +129,19 @@ def plot_fig_3fgh(ax_f, ax_g, ax_h):
     erk2_pssm.index = AAlist
     plot_motifs(erk2_pssm, ax=ax_f, titles="ERK2")
 
-    # ERK2 prediction
-    # Import signaling data
-    seqs, abund = separate_sequence_and_abundance(
-        filter_NaNpeptides(
-            pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:],
-            tmt=2,
-        )
-    )
+    p_signal = CPTAC().get_p_signal()
 
-    # Fit DDMC
-    ddmc_cptac = DDMC(
-        seqs,
+    model = DDMC(
         n_components=30,
         seq_weight=100,
         distance_method="Binomial",
         random_state=5,
         max_iter=1,
-    ).fit(abund)
+    ).fit(p_signal)
 
     clusters = [3, 7, 21]
     # get pssms from ddmc clusters
-    pssms = ddmc_cptac.get_pssms(PsP_background=True, clusters=clusters)
+    pssms = model.get_pssms(PsP_background=True, clusters=clusters)
 
     # append erk2+ pssm
     pssm_names = clusters + ["ERK2+"]
@@ -222,16 +209,24 @@ def shuffle_pssm(pssm):
     return np.insert(shuffled_pssm, 1, pssm[:, -1], axis=1)
 
 
-def plot_fig_3c(model, cluster):
-    """Code to create hierarchical clustering of cluster 1 across treatments"""
-    c1 = pd.DataFrame(model.transform()[:, cluster - 1])
-    X = pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv")
-    index = [col.split("7.")[1].split(".")[0] for col in X.columns[2:]]
-    c1["Inhibitor"] = index
-    c1 = c1.set_index("Inhibitor")
-    lim = np.max(np.abs(c1)) * 0.3
-    g = sns.clustermap(
-        c1,
+def plot_fig_3c():
+    """Code to create hierarchical clustering of cluster 0 across treatments"""
+    p_signal = EBDT().get_p_signal()
+    model = DDMC(
+        n_components=20,
+        seq_weight=5,
+        distance_method="Binomial",
+        random_state=10,
+        max_iter=1,
+    ).fit(p_signal)
+    centers = model.transform(as_df=True)
+    # the labels are structured as "MCF7.<drug>.fold"
+    centers.index = [i[5:-5] for i in centers.index]
+    # first cluster
+    center = centers.iloc[:, 0]
+    lim = np.max(np.abs(center)) * 0.3
+    sns.clustermap(
+        center,
         method="centroid",
         cmap="bwr",
         robust=True,
@@ -242,55 +237,4 @@ def plot_fig_3c(model, cluster):
         figsize=(2, 15),
         yticklabels=True,
         xticklabels=False,
-    )
-
-
-def preprocess_ebdt_mcf7():
-    """Preprocess MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)"""
-    x = (
-        pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv")
-        .drop("FDR", axis=1)
-        .set_index("sh.index.sites")
-        .drop("ARPC2_HUMAN;")
-        .reset_index()
-    )
-    x.insert(0, "Gene", [s.split("(")[0] for s in x["sh.index.sites"]])
-    x.insert(
-        1,
-        "Position",
-        [re.search(r"\(([A-Za-z0-9]+)\)", s).group(1) for s in x["sh.index.sites"]],
-    )
-    x = x.drop("sh.index.sites", axis=1)
-    motifs, del_ids = pos_to_motif(x["Gene"], x["Position"], motif_size=5)
-    x = x.set_index(["Gene", "Position"]).drop(del_ids).reset_index()
-    x.insert(0, "Sequence", motifs)
-    return x
-
-
-def pos_to_motif(genes, pos, motif_size=5):
-    """Map p-site sequence position to uniprot's proteome and extract motifs."""
-    proteome = open("ddmc/data/Sequence_analysis/proteome_uniprot2019.fa", "r")
-    ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
-    motifs = []
-    del_GeneToPos = []
-    for gene, pos in list(zip(genes, pos)):
-        try:
-            UP_seq = ProteomeDict[gene]
-        except BaseException:
-            del_GeneToPos.append([gene, pos])
-            continue
-        idx = int(pos[1:]) - 1
-        motif = list(UP_seq[max(0, idx - motif_size) : idx + motif_size + 1])
-        if (
-            len(motif) != motif_size * 2 + 1
-            or pos[0] != motif[motif_size]
-            or pos[0] not in ["S", "T", "Y"]
-        ):
-            del_GeneToPos.append([gene, pos])
-            continue
-        motif[motif_size] = motif[motif_size].lower()
-        motifs.append("".join(motif))
-    return motifs, del_GeneToPos
-
-
-makeFigure()
+    )
\ No newline at end of file