From 88f6e6838d84eacea515b5f6cdea91e591877c50 Mon Sep 17 00:00:00 2001 From: Armaan Abraham Date: Fri, 2 Feb 2024 13:26:47 -0800 Subject: [PATCH] move ebdt to datasets --- ddmc/datasets.py | 70 +++++++++++++++++++-- ddmc/figures/figureM2.py | 71 +++++++++------------ ddmc/figures/figureM3.py | 132 +++++++++++---------------------------- 3 files changed, 132 insertions(+), 141 deletions(-) diff --git a/ddmc/datasets.py b/ddmc/datasets.py index 1f12e12d..f14e66be 100644 --- a/ddmc/datasets.py +++ b/ddmc/datasets.py @@ -1,8 +1,11 @@ +import re +from pathlib import Path + import numpy as np import pandas as pd from typing import Sequence -from pathlib import Path +from ddmc.motifs import DictProteomeNameToSeq DATA_DIR = Path(__file__).parent / "data" @@ -45,7 +48,7 @@ def select_peptide_subset( class CPTAC: data_dir = DATA_DIR / "MS" / "CPTAC" - def load_sample_to_experiment(self, as_df=False): + def get_sample_to_experiment(self, as_df=False): sample_to_experiment = pd.read_csv(self.data_dir / "IDtoExperiment.csv") if as_df: return sample_to_experiment @@ -60,7 +63,7 @@ def get_p_signal(self) -> pd.DataFrame: return filter_incomplete_peptides( p_signal, min_experiments=2, - sample_to_experiment=self.load_sample_to_experiment(), + sample_to_experiment=self.get_sample_to_experiment(), ) def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]): @@ -72,7 +75,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]): nat_patients = np.char.replace(nat_samples, ".N", "") return np.intersect1d(tumor_patients, nat_patients) - def get_mutations(self, mutation_names: Sequence[str]=None): + def get_mutations(self, mutation_names: Sequence[str] = None): mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv") mutations = mutations.set_index("Sample.ID") patients = self.get_patients_with_nat_and_tumor(mutations.index.values) @@ -94,6 +97,63 @@ def get_hot_cold_labels(self): hot_cold = hot_cold.replace("Hot-tumor enriched", 1) hot_cold = hot_cold.dropna() return np.squeeze(hot_cold).astype(bool) - + def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]: return ~np.array([sample.endswith(".N") for sample in samples]) + + +# MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020) +class EBDT: + def get_p_signal(self) -> pd.DataFrame: + """Preprocess""" + p_signal = ( + pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv") + .drop("FDR", axis=1) + .set_index("sh.index.sites") + .drop("ARPC2_HUMAN;") + .reset_index() + ) + p_signal.insert( + 0, "Gene", [s.split("(")[0] for s in p_signal["sh.index.sites"]] + ) + p_signal.insert( + 1, + "Position", + [ + re.search(r"\(([A-Za-z0-9]+)\)", s).group(1) + for s in p_signal["sh.index.sites"] + ], + ) + p_signal = p_signal.drop("sh.index.sites", axis=1) + motifs, del_ids = self.pos_to_motif(p_signal["Gene"], p_signal["Position"]) + p_signal = p_signal.set_index(["Gene", "Position"]).drop(del_ids).reset_index() + p_signal.insert(0, "Sequence", motifs) + p_signal = p_signal.drop(columns=["Gene", "Position"]) + p_signal = p_signal.set_index("Sequence") + return p_signal + + def pos_to_motif(self, genes, pos): + """Map p-site sequence position to uniprot's proteome and extract motifs.""" + proteome = open(DATA_DIR / "Sequence_analysis" / "proteome_uniprot2019.fa", "r") + motif_size = 5 + ProteomeDict = DictProteomeNameToSeq(proteome, n="gene") + motifs = [] + del_GeneToPos = [] + for gene, pos in list(zip(genes, pos)): + try: + UP_seq = ProteomeDict[gene] + except BaseException: + del_GeneToPos.append([gene, pos]) + continue + idx = int(pos[1:]) - 1 + motif = list(UP_seq[max(0, idx - motif_size) : idx + motif_size + 1]) + if ( + len(motif) != motif_size * 2 + 1 + or pos[0] != motif[motif_size] + or pos[0] not in ["S", "T", "Y"] + ): + del_GeneToPos.append([gene, pos]) + continue + motif[motif_size] = motif[motif_size].lower() + motifs.append("".join(motif)) + return motifs, del_GeneToPos diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py index ee8a7902..a3068f80 100644 --- a/ddmc/figures/figureM2.py +++ b/ddmc/figures/figureM2.py @@ -1,6 +1,7 @@ """ This creates Figure 2: Evaluation of Imputating Missingness """ + import numpy as np from scipy.stats import gmean import pandas as pd @@ -9,6 +10,7 @@ from ..clustering import DDMC from ..pre_processing import filter_NaNpeptides from fancyimpute import IterativeSVD +from ddmc.datasets import CPTAC def makeFigure(): @@ -18,8 +20,8 @@ def makeFigure(): # diagram explaining reconstruction process ax[0].axis("off") - - n_clusters = np.arange(1, 46, 45) + + n_clusters = np.arange(1, 46, 45) # Imputation error across Cluster numbers dataC_W0 = run_repeated_imputation( @@ -47,7 +49,7 @@ def makeFigure(): ) plot_imputation_errs(ax[4], dataW_2C, "Weight", legend=False) ax[4].set_ylim(10.5, 12) - + dataW_20C = run_repeated_imputation( "Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=1 ) @@ -76,8 +78,6 @@ def plot_imputation_errs(ax, data, kind, legend=True): gm["Zero"] = np.log(data.groupby([kind]).Zero.apply(gmean).values) gm["PCA"] = np.log(data.groupby([kind]).PCA.apply(gmean).values) - gm.to_csv("WeightSearch.csv") - sns.regplot( x=kind, y="DDMC", @@ -110,29 +110,12 @@ def plot_imputation_errs(ax, data, kind, legend=True): ax.legend().remove() -def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=6): +def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1): """Calculate missingness error across different numbers of clusters and/or weights.""" assert len(weights) == len(n_clusters) - X_raw = filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:], - tmt=tmt, - ) - # reset index - X_raw.reset_index(drop=True, inplace=True) - - info_cols = ["Sequence", "Protein", "Gene", "Position"] - sample_cols = [col for col in X_raw.columns if col not in info_cols] - sequences = X_raw["Sequence"].copy() - X = X_raw[sample_cols].copy() - - # the condition in which each sample was collected - sample_to_condition_df = pd.read_csv("ddmc/data/MS/CPTAC/IDtoExperiment.csv") - assert all( - sample_to_condition_df.iloc[:, 0] == X.columns - ), "Sample labels don't match." - X = X.to_numpy() - sample_to_condition = sample_to_condition_df["Experiment (TMT10plex)"].to_numpy() - assert X.shape[1] == sample_to_condition.size + cptac = CPTAC() + p_signal = cptac.get_p_signal() + sample_to_experiment = cptac.get_sample_to_experiment() df = pd.DataFrame( columns=[ @@ -148,15 +131,15 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt= ) for ii in range(n_runs): - X_miss = add_missingness(X, sample_to_condition) + X_miss = add_missingness(p_signal, sample_to_experiment) baseline_imputations = [ impute_mean(X_miss), impute_zero(X_miss), impute_min(X_miss), - impute_pca(X, 5), + impute_pca(X_miss, 5), ] baseline_errs = [ - imputation_error(X, X_impute) for X_impute in baseline_imputations + imputation_error(p_signal, X_impute) for X_impute in baseline_imputations ] for jj, cluster in enumerate(n_clusters): @@ -165,20 +148,23 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt= cluster, weights[jj], imputation_error( - X, impute_ddmc(X, sequences, cluster, weights[jj], distance_method) + p_signal, + impute_ddmc(p_signal, cluster, weights[jj], distance_method), ), *baseline_errs, ] return df -def add_missingness(X, sample_to_experiment): +def add_missingness(p_signal, sample_to_experiment): """Remove a random TMT experiment for each peptide.""" - X = X.copy() - for ii in range(X.shape[0]): - tmtNum = sample_to_experiment[np.isfinite(X[ii, :])] - X[ii, sample_to_experiment == np.random.choice(np.unique(tmtNum))] = np.nan - return X + p_signal = p_signal.copy() + for ii in range(p_signal.shape[0]): + experiments = sample_to_experiment[np.isfinite(p_signal[ii, :])] + p_signal[ + ii, sample_to_experiment == np.random.choice(np.unique(experiments)) + ] = np.nan + return p_signal def imputation_error(X, X_impute): @@ -195,7 +181,7 @@ def impute_zero(X): def impute_min(X): - X = X.copy() + X = X.copy() np.copyto(X, np.nanmin(X, axis=0, keepdims=True), where=np.isnan(X)) return X @@ -211,8 +197,9 @@ def impute_pca(X, rank): return IterativeSVD(rank=rank, verbose=False).fit_transform(X) -def impute_ddmc(X, sequences, n_clusters, weight, distance_method): - return DDMC(sequences, n_clusters, weight, distance_method, max_iter=1, tol=0.1).fit(X).impute(X) - - -makeFigure() +def impute_ddmc(p_signal, n_clusters, weight, distance_method): + return ( + DDMC(n_clusters, weight, distance_method, max_iter=1) + .fit(p_signal) + .impute(p_signal) + ) diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py index c0d1b592..47e8a8f5 100644 --- a/ddmc/figures/figureM3.py +++ b/ddmc/figures/figureM3.py @@ -2,22 +2,21 @@ This creates Figure 2: Validations """ -import re import numpy as np import pandas as pd import seaborn as sns -from ..clustering import DDMC -from ..binomial import AAlist -from .common import getSetup -from .common import ( + +from ddmc.clustering import DDMC,compute_control_pssm, get_pspl_pssm_distances +from ddmc.binomial import AAlist +from ddmc.figures.common import +from ddmc.figures.common import ( + getSetup, plot_motifs, plot_cluster_kinase_distances, plot_pca_on_cluster_centers, ) -from ..clustering import compute_control_pssm, get_pspl_pssm_distances -from ..binomial import AAlist -from ..motifs import DictProteomeNameToSeq, get_pspls -from ..pre_processing import filter_NaNpeptides, separate_sequence_and_abundance +from ddmc.datasets import CPTAC, EBDT +from ddmc.motifs import get_pspls def makeFigure(): @@ -38,27 +37,24 @@ def makeFigure(): return f - def plot_fig_3abd(ax_a, ax_b, ax_d): # Import signaling data - x = preprocess_ebdt_mcf7() - seqs, abund = separate_sequence_and_abundance(x) + p_signal = EBDT().get_p_signal() # Fit DDMC - ddmc = DDMC( - seqs, + model = DDMC( n_components=20, seq_weight=5, distance_method="Binomial", random_state=10, max_iter=1, - ).fit(abund) + ).fit(p_signal) # get cluster centers - centers = ddmc.transform(as_df=True) + centers = model.transform(as_df=True) # parse inhibitor names from sample names - inhibitors = [s.split(".")[1].split(".")[0] for s in abund.columns] + inhibitors = [s.split(".")[1].split(".")[0] for s in p_signal.columns] # create new bool array specifying whether each inhibitor is an AKT inhibitor is_AKTi = [ @@ -88,8 +84,8 @@ def plot_fig_3abd(ax_a, ax_b, ax_d): # Plot kinase predictions for cluster 16 plot_cluster_kinase_distances( - ddmc.predict_upstream_kinases()[[16]], - ddmc.get_pssms(PsP_background=True, clusters=[16])[0], + model.predict_upstream_kinases()[[16]], + model.get_pssms(PsP_background=True, clusters=[16])[0], ax=ax_d, ) @@ -133,28 +129,19 @@ def plot_fig_3fgh(ax_f, ax_g, ax_h): erk2_pssm.index = AAlist plot_motifs(erk2_pssm, ax=ax_f, titles="ERK2") - # ERK2 prediction - # Import signaling data - seqs, abund = separate_sequence_and_abundance( - filter_NaNpeptides( - pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:], - tmt=2, - ) - ) + p_signal = CPTAC().get_p_signal() - # Fit DDMC - ddmc_cptac = DDMC( - seqs, + model = DDMC( n_components=30, seq_weight=100, distance_method="Binomial", random_state=5, max_iter=1, - ).fit(abund) + ).fit(p_signal) clusters = [3, 7, 21] # get pssms from ddmc clusters - pssms = ddmc_cptac.get_pssms(PsP_background=True, clusters=clusters) + pssms = model.get_pssms(PsP_background=True, clusters=clusters) # append erk2+ pssm pssm_names = clusters + ["ERK2+"] @@ -222,16 +209,24 @@ def shuffle_pssm(pssm): return np.insert(shuffled_pssm, 1, pssm[:, -1], axis=1) -def plot_fig_3c(model, cluster): - """Code to create hierarchical clustering of cluster 1 across treatments""" - c1 = pd.DataFrame(model.transform()[:, cluster - 1]) - X = pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv") - index = [col.split("7.")[1].split(".")[0] for col in X.columns[2:]] - c1["Inhibitor"] = index - c1 = c1.set_index("Inhibitor") - lim = np.max(np.abs(c1)) * 0.3 - g = sns.clustermap( - c1, +def plot_fig_3c(): + """Code to create hierarchical clustering of cluster 0 across treatments""" + p_signal = EBDT().get_p_signal() + model = DDMC( + n_components=20, + seq_weight=5, + distance_method="Binomial", + random_state=10, + max_iter=1, + ).fit(p_signal) + centers = model.transform(as_df=True) + # the labels are structured as "MCF7..fold" + centers.index = [i[5:-5] for i in centers.index] + # first cluster + center = centers.iloc[:, 0] + lim = np.max(np.abs(center)) * 0.3 + sns.clustermap( + center, method="centroid", cmap="bwr", robust=True, @@ -242,55 +237,4 @@ def plot_fig_3c(model, cluster): figsize=(2, 15), yticklabels=True, xticklabels=False, - ) - - -def preprocess_ebdt_mcf7(): - """Preprocess MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)""" - x = ( - pd.read_csv("ddmc/data/Validations/Computational/ebdt_mcf7.csv") - .drop("FDR", axis=1) - .set_index("sh.index.sites") - .drop("ARPC2_HUMAN;") - .reset_index() - ) - x.insert(0, "Gene", [s.split("(")[0] for s in x["sh.index.sites"]]) - x.insert( - 1, - "Position", - [re.search(r"\(([A-Za-z0-9]+)\)", s).group(1) for s in x["sh.index.sites"]], - ) - x = x.drop("sh.index.sites", axis=1) - motifs, del_ids = pos_to_motif(x["Gene"], x["Position"], motif_size=5) - x = x.set_index(["Gene", "Position"]).drop(del_ids).reset_index() - x.insert(0, "Sequence", motifs) - return x - - -def pos_to_motif(genes, pos, motif_size=5): - """Map p-site sequence position to uniprot's proteome and extract motifs.""" - proteome = open("ddmc/data/Sequence_analysis/proteome_uniprot2019.fa", "r") - ProteomeDict = DictProteomeNameToSeq(proteome, n="gene") - motifs = [] - del_GeneToPos = [] - for gene, pos in list(zip(genes, pos)): - try: - UP_seq = ProteomeDict[gene] - except BaseException: - del_GeneToPos.append([gene, pos]) - continue - idx = int(pos[1:]) - 1 - motif = list(UP_seq[max(0, idx - motif_size) : idx + motif_size + 1]) - if ( - len(motif) != motif_size * 2 + 1 - or pos[0] != motif[motif_size] - or pos[0] not in ["S", "T", "Y"] - ): - del_GeneToPos.append([gene, pos]) - continue - motif[motif_size] = motif[motif_size].lower() - motifs.append("".join(motif)) - return motifs, del_GeneToPos - - -makeFigure() + ) \ No newline at end of file