Skip to content

Commit

Permalink
move ebdt to datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 2, 2024
1 parent 3ef804f commit 88f6e68
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 141 deletions.
70 changes: 65 additions & 5 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import re
from pathlib import Path

import numpy as np
import pandas as pd
from typing import Sequence

from pathlib import Path
from ddmc.motifs import DictProteomeNameToSeq

DATA_DIR = Path(__file__).parent / "data"

Expand Down Expand Up @@ -45,7 +48,7 @@ def select_peptide_subset(
class CPTAC:
data_dir = DATA_DIR / "MS" / "CPTAC"

def load_sample_to_experiment(self, as_df=False):
def get_sample_to_experiment(self, as_df=False):
sample_to_experiment = pd.read_csv(self.data_dir / "IDtoExperiment.csv")
if as_df:
return sample_to_experiment
Expand All @@ -60,7 +63,7 @@ def get_p_signal(self) -> pd.DataFrame:
return filter_incomplete_peptides(
p_signal,
min_experiments=2,
sample_to_experiment=self.load_sample_to_experiment(),
sample_to_experiment=self.get_sample_to_experiment(),
)

def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
Expand All @@ -72,7 +75,7 @@ def get_patients_with_nat_and_tumor(self, samples: np.ndarray[str]):
nat_patients = np.char.replace(nat_samples, ".N", "")
return np.intersect1d(tumor_patients, nat_patients)

def get_mutations(self, mutation_names: Sequence[str]=None):
def get_mutations(self, mutation_names: Sequence[str] = None):
mutations = pd.read_csv(self.data_dir / "Patient_Mutations.csv")
mutations = mutations.set_index("Sample.ID")
patients = self.get_patients_with_nat_and_tumor(mutations.index.values)
Expand All @@ -94,6 +97,63 @@ def get_hot_cold_labels(self):
hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
hot_cold = hot_cold.dropna()
return np.squeeze(hot_cold).astype(bool)

def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
    """Flag each sample as tumor (True) or NAT (False).

    A sample is treated as NAT when its ID ends with ".N"; every other ID
    is treated as tumor.

    Args:
        samples: Sample IDs.

    Returns:
        Boolean array with one entry per sample, in input order.
    """
    # Build the bool array directly: the previous ``~np.array([...])`` form
    # produced a float64 array for empty input, on which ``~`` raises
    # TypeError.
    return np.array([not sample.endswith(".N") for sample in samples], dtype=bool)


# MCF7 mass spec data set from EBDT (Hijazi et al Nat Biotech 2020)
class EBDT:
    """Loader for the EBDT MCF7 mass spec data set."""

    def get_p_signal(self) -> pd.DataFrame:
        """Load the EBDT table and index it by phosphosite motif.

        Reads ``ebdt_mcf7.csv``, drops the ``FDR`` column and the
        ``ARPC2_HUMAN;`` row, splits every ``sh.index.sites`` label into a
        gene name (text before the first "(") and a position (text inside
        the parentheses), and maps each site to a motif with
        :meth:`pos_to_motif`. Sites that cannot be mapped are removed.

        Returns:
            The remaining columns of the table, indexed by ``Sequence``
            (the motif string of each retained site).
        """
        site_col = "sh.index.sites"
        p_signal = (
            pd.read_csv(DATA_DIR / "Validations" / "Computational" / "ebdt_mcf7.csv")
            .drop("FDR", axis=1)
            .set_index(site_col)
            .drop("ARPC2_HUMAN;")
            .reset_index()
        )
        site_labels = p_signal[site_col]
        p_signal.insert(0, "Gene", [s.split("(")[0] for s in site_labels])
        p_signal.insert(
            1,
            "Position",
            [re.search(r"\(([A-Za-z0-9]+)\)", s).group(1) for s in site_labels],
        )
        p_signal = p_signal.drop(site_col, axis=1)

        motifs, del_ids = self.pos_to_motif(p_signal["Gene"], p_signal["Position"])
        # MultiIndex labels must be hashable, so pass (gene, position) tuples
        # rather than the [gene, position] lists returned by pos_to_motif.
        p_signal = (
            p_signal.set_index(["Gene", "Position"])
            .drop([tuple(pair) for pair in del_ids])
            .reset_index()
        )
        p_signal.insert(0, "Sequence", motifs)
        p_signal = p_signal.drop(columns=["Gene", "Position"])
        return p_signal.set_index("Sequence")

    def pos_to_motif(self, genes, pos, *, motif_size=5, proteome=None):
        """Map p-site sequence position to uniprot's proteome and extract motifs.

        Args:
            genes: Iterable of gene names, used as keys into the proteome.
            pos: Iterable of site labels, aligned with ``genes``. The first
                character is the residue letter and the rest is the 1-based
                position in the protein sequence (e.g. ``"S7"``).
            motif_size: Number of residues kept on each side of the site;
                motifs have length ``2 * motif_size + 1``.
            proteome: Optional mapping from gene name to protein sequence.
                When omitted, it is built from ``proteome_uniprot2019.fa``
                via ``DictProteomeNameToSeq``.

        Returns:
            ``(motifs, dropped)`` where ``motifs`` holds one string per
            retained site with the central residue lower-cased, and
            ``dropped`` holds the ``[gene, position]`` pairs that were
            rejected. A site is rejected when its gene is missing from the
            proteome, the full window does not fit inside the sequence, the
            sequence residue differs from the label, or the residue is not
            S, T or Y.
        """
        if proteome is None:
            fasta_path = DATA_DIR / "Sequence_analysis" / "proteome_uniprot2019.fa"
            # Close the handle once parsed. NOTE(review): assumes
            # DictProteomeNameToSeq consumes the file eagerly and returns a
            # dict-like mapping -- confirm.
            with open(fasta_path, "r") as fasta:
                proteome = DictProteomeNameToSeq(fasta, n="gene")

        window = 2 * motif_size + 1
        motifs = []
        dropped = []
        for gene, site in zip(genes, pos):
            try:
                protein_seq = proteome[gene]
            except KeyError:
                # Gene absent from the proteome: nothing to map onto.
                dropped.append([gene, site])
                continue
            idx = int(site[1:]) - 1  # label is 1-based, sequence is 0-based
            motif = list(protein_seq[max(0, idx - motif_size) : idx + motif_size + 1])
            # The length check must come first: it short-circuits before the
            # central residue of a truncated window is indexed.
            if (
                len(motif) != window
                or site[0] != motif[motif_size]
                or site[0] not in ("S", "T", "Y")
            ):
                dropped.append([gene, site])
                continue
            motif[motif_size] = motif[motif_size].lower()
            motifs.append("".join(motif))
        return motifs, dropped
71 changes: 29 additions & 42 deletions ddmc/figures/figureM2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This creates Figure 2: Evaluation of Imputing Missingness
"""

import numpy as np
from scipy.stats import gmean
import pandas as pd
Expand All @@ -9,6 +10,7 @@
from ..clustering import DDMC
from ..pre_processing import filter_NaNpeptides
from fancyimpute import IterativeSVD
from ddmc.datasets import CPTAC


def makeFigure():
Expand All @@ -18,8 +20,8 @@ def makeFigure():

# diagram explaining reconstruction process
ax[0].axis("off")
n_clusters = np.arange(1, 46, 45)

n_clusters = np.arange(1, 46, 45)

# Imputation error across Cluster numbers
dataC_W0 = run_repeated_imputation(
Expand Down Expand Up @@ -47,7 +49,7 @@ def makeFigure():
)
plot_imputation_errs(ax[4], dataW_2C, "Weight", legend=False)
ax[4].set_ylim(10.5, 12)

dataW_20C = run_repeated_imputation(
"Binomial", weights=weights, n_clusters=[20] * len(weights), n_runs=1
)
Expand Down Expand Up @@ -76,8 +78,6 @@ def plot_imputation_errs(ax, data, kind, legend=True):
gm["Zero"] = np.log(data.groupby([kind]).Zero.apply(gmean).values)
gm["PCA"] = np.log(data.groupby([kind]).PCA.apply(gmean).values)

gm.to_csv("WeightSearch.csv")

sns.regplot(
x=kind,
y="DDMC",
Expand Down Expand Up @@ -110,29 +110,12 @@ def plot_imputation_errs(ax, data, kind, legend=True):
ax.legend().remove()


def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=6):
def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1):
"""Calculate missingness error across different numbers of clusters and/or weights."""
assert len(weights) == len(n_clusters)
X_raw = filter_NaNpeptides(
pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotifs.csv").iloc[:, 1:],
tmt=tmt,
)
# reset index
X_raw.reset_index(drop=True, inplace=True)

info_cols = ["Sequence", "Protein", "Gene", "Position"]
sample_cols = [col for col in X_raw.columns if col not in info_cols]
sequences = X_raw["Sequence"].copy()
X = X_raw[sample_cols].copy()

# the condition in which each sample was collected
sample_to_condition_df = pd.read_csv("ddmc/data/MS/CPTAC/IDtoExperiment.csv")
assert all(
sample_to_condition_df.iloc[:, 0] == X.columns
), "Sample labels don't match."
X = X.to_numpy()
sample_to_condition = sample_to_condition_df["Experiment (TMT10plex)"].to_numpy()
assert X.shape[1] == sample_to_condition.size
cptac = CPTAC()
p_signal = cptac.get_p_signal()
sample_to_experiment = cptac.get_sample_to_experiment()

df = pd.DataFrame(
columns=[
Expand All @@ -148,15 +131,15 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=
)

for ii in range(n_runs):
X_miss = add_missingness(X, sample_to_condition)
X_miss = add_missingness(p_signal, sample_to_experiment)
baseline_imputations = [
impute_mean(X_miss),
impute_zero(X_miss),
impute_min(X_miss),
impute_pca(X, 5),
impute_pca(X_miss, 5),
]
baseline_errs = [
imputation_error(X, X_impute) for X_impute in baseline_imputations
imputation_error(p_signal, X_impute) for X_impute in baseline_imputations
]

for jj, cluster in enumerate(n_clusters):
Expand All @@ -165,20 +148,23 @@ def run_repeated_imputation(distance_method, weights, n_clusters, n_runs=1, tmt=
cluster,
weights[jj],
imputation_error(
X, impute_ddmc(X, sequences, cluster, weights[jj], distance_method)
p_signal,
impute_ddmc(p_signal, cluster, weights[jj], distance_method),
),
*baseline_errs,
]
return df


def add_missingness(p_signal, sample_to_experiment):
    """Remove a random TMT experiment for each peptide.

    For every row, one experiment is drawn at random from those that have at
    least one finite value in that row, and every column belonging to that
    experiment is set to NaN. The input is never modified.

    Args:
        p_signal: 2-D signal, rows are peptides and columns are samples.
            May be a NumPy array or a pandas DataFrame.
        sample_to_experiment: Experiment label of each column, aligned with
            the columns of ``p_signal``.
            NOTE(review): assumed to be 1-D array-like -- confirm against
            ``CPTAC.get_sample_to_experiment``.

    Returns:
        A float copy of ``p_signal`` with the extra missing values, as a
        DataFrame (same index and columns) when a DataFrame was passed and
        as an array otherwise.
    """
    is_frame = isinstance(p_signal, pd.DataFrame)
    # Work on a positional float array: ``frame[ii, :]`` is not valid
    # DataFrame indexing, and NaN cannot be stored in an integer array.
    if is_frame:
        values = p_signal.to_numpy(dtype=float, copy=True)
    else:
        values = np.array(p_signal, dtype=float)
    experiments = np.asarray(sample_to_experiment)

    for row in values:  # each row is a view, so assignment writes into values
        observed = np.unique(experiments[np.isfinite(row)])
        if observed.size == 0:
            # Nothing measured for this peptide; np.random.choice would
            # raise on an empty candidate set.
            continue
        row[experiments == np.random.choice(observed)] = np.nan

    if is_frame:
        return pd.DataFrame(values, index=p_signal.index, columns=p_signal.columns)
    return values


def imputation_error(X, X_impute):
Expand All @@ -195,7 +181,7 @@ def impute_zero(X):


def impute_min(X):
    """Return a copy of ``X`` whose NaNs are replaced by the per-column minimum.

    The minimum of each column is taken over its non-NaN entries
    (``np.nanmin`` along axis 0) and written into the NaN positions of that
    column. ``X`` itself is left untouched.
    """
    filled = X.copy()
    missing = np.isnan(filled)
    column_floor = np.nanmin(filled, axis=0, keepdims=True)
    np.copyto(filled, column_floor, where=missing)
    return filled

Expand All @@ -211,8 +197,9 @@ def impute_pca(X, rank):
return IterativeSVD(rank=rank, verbose=False).fit_transform(X)


def impute_ddmc(X, sequences, n_clusters, weight, distance_method):
return DDMC(sequences, n_clusters, weight, distance_method, max_iter=1, tol=0.1).fit(X).impute(X)


makeFigure()
def impute_ddmc(p_signal, n_clusters, weight, distance_method):
    """Fit a single-iteration DDMC model on ``p_signal`` and impute with it.

    Builds ``DDMC(n_clusters, weight, distance_method, max_iter=1)``, fits it
    on ``p_signal`` and returns the result of calling ``impute`` on the
    fitted model with the same data.
    """
    model = DDMC(n_clusters, weight, distance_method, max_iter=1)
    fitted = model.fit(p_signal)
    return fitted.impute(p_signal)
Loading

0 comments on commit 88f6e68

Please sign in to comment.