remake figure 7

meyer-lab · Feb 2, 2024 · d0fb555 · d0fb555
1 parent 83ee890
commit d0fb555
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 105 deletions.
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -97,7 +97,9 @@ def fit(self, p_signal: pd.DataFrame):
         # TODO: probably just pass in sequences here
         assert isinstance(p_signal, pd.DataFrame)
         sequences = p_signal.index.values
-        assert isinstance(sequences[0], str) and len(sequences[0]) == 11, "The index of p_signal must be the peptide sequences of length 11"
+        assert (
+            isinstance(sequences[0], str) and len(sequences[0]) == 11
+        ), "The index of p_signal must be the peptide sequences of length 11"
         self.p_signal = p_signal
         self.gen_peptide_distances(sequences, self.distance_method)
 
@@ -156,7 +158,11 @@ def transform(self, as_df=False) -> np.ndarray | pd.DataFrame:
         check_is_fitted(self, ["means_"])
         centers = self.means_.T
         if as_df:
-            centers = pd.DataFrame(centers, index=self.p_signal.columns, columns=np.arange(self.n_components))
+            centers = pd.DataFrame(
+                centers,
+                index=self.p_signal.columns,
+                columns=np.arange(self.n_components),
+            )
         return centers
 
     def impute(self, X: np.ndarray) -> np.ndarray:
@@ -172,7 +178,7 @@ def impute(self, X: np.ndarray) -> np.ndarray:
         assert np.all(np.isfinite(X))
         return X
 
-    def get_pssms(self, PsP_background=False, clusters: List=None):
+    def get_pssms(self, PsP_background=False, clusters: List = None):
         """Compute position-specific scoring matrix of each cluster.
         Note, to normalize by amino acid frequency this uses either
         all the sequences in the data set or a collection of random MS phosphosites in PhosphoSitePlus.
@@ -184,12 +190,13 @@ def get_pssms(self, PsP_background=False, clusters: List=None):
         else:
             back_pssm = np.zeros((len(AAlist), 11), dtype=float)
 
-        l1 = list(np.arange(self.n_components) + 1)
+        l1 = list(np.arange(self.n_components))
         l2 = list(set(self.labels()))
         ec = [i for i in l1 + l2 if i not in l1 or i not in l2]
-        for ii in range(1, self.n_components + 1):
+        for ii in range(self.n_components):
             # Check for empty clusters and ignore them, if there are
-            if ii in ec: continue
+            if ii in ec:
+                continue
 
             # Compute PSSM
             pssm = np.zeros((len(AAlist), 11), dtype=float)
@@ -231,11 +238,13 @@ def get_pssms(self, PsP_background=False, clusters: List=None):
 
             pssms.append(np.clip(pssm, a_min=0, a_max=3))
             pssm_names.append(ii)
-        
+
         pssm_names, pssms = np.array(pssm_names), np.array(pssms)
-        
+
         if clusters is not None:
-            return pssms[[np.where(pssm_names == cluster)[0][0] for cluster in clusters]]
+            return pssms[
+                [np.where(pssm_names == cluster)[0][0] for cluster in clusters]
+            ]
 
         return pssm_names, pssms
 
@@ -258,6 +267,10 @@ def predict_upstream_kinases(
 
         return distances
 
+    def has_empty_clusters(self):
+        check_is_fitted(self, ["scores_"])
+        return np.unique(self.labels()).size != self.n_components
+
     def predict(self) -> np.ndarray:
         """Provided the current model parameters, predict the cluster each peptide belongs to."""
         check_is_fitted(self, ["scores_"])
@@ -279,7 +292,7 @@ def get_pspl_pssm_distances(
     """
     Args:
         pspls: kinase specificity profiles of shape (n_kinase, 20, 9)
-        pssms: position-specific scoring matrices of shape (n_pssms, 20, 11) 
+        pssms: position-specific scoring matrices of shape (n_pssms, 20, 11)
     """
     assert pssms.shape[1:3] == (20, 11)
     assert pspls.shape[1:3] == (20, 9)

diff --git a/ddmc/datasets.py b/ddmc/datasets.py
@@ -23,14 +23,14 @@ def filter_incomplete_peptides(
         assert min_experiments is not None
         assert sample_to_experiment is not None
         unique_experiments = np.unique(sample_to_experiment)
-        experiments_grid, StoE_grid = np.meshgrid(
+        experiments_grid, s_to_e_grid = np.meshgrid(
             unique_experiments, sample_to_experiment, indexing="ij"
         )
-        bool_matrix = experiments_grid == StoE_grid
+        bool_matrix = experiments_grid == s_to_e_grid
         present = ~np.isnan(p_signal.values)
         peptide_idx = (present[None, :, :] & bool_matrix[:, None, :]).any(axis=2).sum(
             axis=0
-        ) > min_experiments
+        ) >= min_experiments
     return p_signal.iloc[peptide_idx]
 
 
@@ -79,7 +79,7 @@ def get_mutations(self, mutation_names: Sequence[str]=None):
         mutations = mutations.loc[patients]
         if mutation_names is not None:
             mutations = mutations[mutation_names]
-        return mutations
+        return mutations.astype(bool)
 
     def get_hot_cold_labels(self):
         hot_cold = (
@@ -88,11 +88,12 @@ def get_hot_cold_labels(self):
             .sort_values(by="Sample ID")
             .set_index("Sample ID")
         )["Group"]
+        hot_cold = hot_cold[~hot_cold.index.str.endswith(".N")]
         hot_cold = hot_cold[hot_cold != "NAT enriched"]
         hot_cold = hot_cold.replace("Cold-tumor enriched", 0)
         hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
         hot_cold = hot_cold.dropna()
-        return np.squeeze(hot_cold)
+        return np.squeeze(hot_cold).astype(bool)
 
     def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
         return ~np.array([sample.endswith(".N") for sample in samples])
diff --git a/ddmc/figures/figureM6.py b/ddmc/figures/figureM6.py
@@ -8,15 +8,15 @@
 from statsmodels.stats.multitest import multipletests
 
 from ddmc.clustering import DDMC
-from ddmc.datasets import CPTAC, select_peptide_subset
+from ddmc.datasets import CPTAC
 from ddmc.figures.common import getSetup, plot_cluster_kinase_distances
 from ddmc.logistic_regression import plotROC, plotClusterCoefficients
 
 
 def makeFigure():
     axes, f = getSetup((11, 7), (2, 3), multz={0: 1})
     cptac = CPTAC()
-    p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=0.01)
+    p_signal = cptac.get_p_signal()
     model = DDMC(n_components=30, seq_weight=100, max_iter=10).fit(p_signal)
 
     # Import Genotype data
@@ -52,7 +52,6 @@ def makeFigure():
         ax=axes[0],
         linewidth=0.25,
     )
-    axes[0].legend(prop={"size": 8})
 
     annotation_height = df_violin["p-signal"].max() + 0.02
     for i, pval in enumerate(pvals):

diff --git a/ddmc/figures/figureM7.py b/ddmc/figures/figureM7.py
@@ -1,114 +1,84 @@
-"""
-This creates Figure 7: Tumor infiltrating immune cells
-"""
-
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import textwrap
+from scipy.stats import mannwhitneyu
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.preprocessing import StandardScaler
-from .common import getSetup, plot_distance_to_upstream_kinase
-from .figureM5 import (
-    build_pval_matrix,
-    calculate_mannW_pvals,
-    plot_clusters_binaryfeatures,
-)
-from ..clustering import DDMC
-from ..logistic_regression import plotROC, plotClusterCoefficients
-from ..pre_processing import filter_NaNpeptides
+from statsmodels.stats.multitest import multipletests
 
+from ddmc.clustering import DDMC
+from ddmc.datasets import CPTAC
+from ddmc.figures.common import plot_cluster_kinase_distances, getSetup
+from ddmc.logistic_regression import plotROC, plotClusterCoefficients
 
 def makeFigure():
     """Get a list of the axis objects and create a figure"""
     # Get list of axis objects
-    ax, f = getSetup((11, 7), (2, 3), multz={0: 1})
-
-    # Import signaling data
-    X = filter_NaNpeptides(
-        pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
-        tmt=2,
+    axes, f = getSetup((11, 7), (2, 3), multz={0: 1, 4: 1})
+    cptac = CPTAC()
+    is_hot = cptac.get_hot_cold_labels()
+    p_signal = cptac.get_p_signal()
+    model = DDMC(n_components=30, seq_weight=100, max_iter=10, random_state=5).fit(
+        p_signal
     )
-    d = X.select_dtypes(include=[float]).T
-    i = X["Sequence"]
-
-    # Fit DDMC
-    model = DDMC(
-        i, n_components=30, seq_weight=100, distance_method="Binomial", random_state=5
-    ).fit(d)
-
-    X = pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:]
-    X = filter_NaNpeptides(X, tmt=2)
-    centers = pd.DataFrame(model.transform())
-    centers.columns = np.arange(model.n_components) + 1
-    centers["Patient_ID"] = X.columns[4:]
-    centers = (
-        centers.loc[~centers["Patient_ID"].str.endswith(".N"), :]
-        .sort_values(by="Patient_ID")
-        .set_index("Patient_ID")
+    assert (
+        not model.has_empty_clusters()
+    ), "This plot assumes that every cluster will have at least one peptide. Please rerun with fewer components are more peptides."
+
+    centers = model.transform(as_df=True).loc[is_hot.index]
+
+    pvals = []
+    centers_m = centers[is_hot]
+    centers_wt = centers[~is_hot]
+    for col in centers.columns:
+        pvals.append(mannwhitneyu(centers_m[col], centers_wt[col])[1])
+    pvals = multipletests(pvals)[1]
+
+    # plot tumor vs nat by cluster
+    df_violin = (
+        centers.assign(is_hot=is_hot)
+        .reset_index()
+        .melt(
+            id_vars="is_hot",
+            value_vars=centers.columns,
+            value_name="p-signal",
+            var_name="Cluster",
+        )
     )
-    centers = centers.drop(
-        [14, 24], axis=1
-    )  # Drop clusters 14&24, contain only 1 peptide
 
-    # Import Cold-Hot Tumor data
-    cent1, y = FormatXYmatrices(centers.copy())
-
-    # Normalize
-    cent1 = cent1.T
-    cent1.iloc[:, :] = StandardScaler(with_std=False).fit_transform(cent1.iloc[:, :])
-    cent1 = cent1.T
-
-    # Hypothesis Testing
-    cent1["TI"] = y.values
-    pvals = calculate_mannW_pvals(cent1, "TI", 1, 0)
-    pvals = build_pval_matrix(model.n_components, pvals)
-    cent1["TI"] = cent1["TI"].replace(0, "CTE")
-    cent1["TI"] = cent1["TI"].replace(1, "HTE")
-    plot_clusters_binaryfeatures(cent1, "TI", ax[0], pvals=pvals, loc="lower left")
-    ax[0].legend(loc="lower left", prop={"size": 10})
+    sns.violinplot(
+        data=df_violin,
+        x="Cluster",
+        y="p-signal",
+        hue="is_hot",
+        dodge=True,
+        ax=axes[0],
+        linewidth=0.25,
+    )
 
-    # Logistic Regression
+    centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(centers)
     lr = LogisticRegressionCV(
-        cv=15, solver="saga", n_jobs=-1, penalty="l1", max_iter=10000
+        cv=3, solver="saga", n_jobs=1, penalty="l1", max_iter=10000
     )
-    plotROC(ax[1], lr, cent1.iloc[:, :-1].values, y, cv_folds=4, title="ROC TI")
-    plotClusterCoefficients(
-        ax[2], lr.fit(cent1.iloc[:, :-1], y.values), title="TI weights"
+    plotROC(
+        lr, centers.values, is_hot.values, cv_folds=3, title="ROC TI", return_mAUC=True
     )
+    plotClusterCoefficients(axes[1], lr, title="")
 
-    # plot Upstream Kinases
-    plot_distance_to_upstream_kinase(model, [17, 20, 21], ax[3], num_hits=3)
-
-    return f
+    top_clusters = np.argsort(np.abs(lr.coef_.squeeze()))[-3:]
 
+    #  plot predicted kinases for most weighted clusters
+    distances = model.predict_upstream_kinases()[top_clusters]
 
-def FormatXYmatrices(centers):
-    """Make sure Y matrix has the same matching samples has the signaling centers"""
-    y = (
-        pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv")
-        .dropna(axis=1)
-        .sort_values(by="Sample ID")
+    # plot upstream Kinases
+    plot_cluster_kinase_distances(
+        distances, model.get_pssms(clusters=top_clusters), axes[2], num_hits=2
     )
-    y = y.loc[~y["Sample ID"].str.endswith(".N"), :].set_index("Sample ID")
-    l1 = list(centers.index)
-    l2 = list(y.index)
-    dif = [i for i in l1 + l2 if i not in l1 or i not in l2]
-    centers = centers.drop(dif)
-
-    # Transform to binary
-    y = y.replace("Cold-tumor enriched", 0)
-    y = y.replace("Hot-tumor enriched", 1)
-    y = np.squeeze(y)
-
-    # Remove NAT-enriched samples
-    centers = centers.drop(y[y == "NAT enriched"].index)
-    y = y.drop(y[y == "NAT enriched"].index).astype(int)
-    assert all(centers.index.values == y.index.values), "Samples don't match"
-    return centers, y
-
+    return f
 
 def plot_ImmuneGOs(cluster, ax, title=False, max_width=25, n=False, loc="best"):
+    # THIS FUNCION IS NOT MAINTAINED
     """Plot immune-related GO"""
     go = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GO_C" + str(cluster) + ".csv")
     im = go[go["GO biological process complete"].str.contains("immune")]