remake figure 5

meyer-lab · Feb 1, 2024 · b190377 · b190377
1 parent 2f59463
commit b190377
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 185 deletions.
diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -23,7 +23,7 @@ def __init__(
         self,
         n_components: int,
         seq_weight: float,
-        distance_method: Literal["PAM250", "Binomial"],
+        distance_method: Literal["PAM250", "Binomial"] = "Binomial",
         random_state=None,
         max_iter=200,
         tol=1e-4,

diff --git a/ddmc/datasets.py b/ddmc/datasets.py
@@ -91,3 +91,6 @@ def get_hot_cold_labels(self):
         hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
         hot_cold = hot_cold.dropna()
         return np.squeeze(hot_cold)
+
+    def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
+        return ~np.array([sample.endswith(".N") for sample in samples])
diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py
@@ -57,8 +57,7 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
     ).fit(abund)
 
     # get cluster centers
-    centers = pd.DataFrame(ddmc.transform())
-    centers.columns = np.arange(ddmc.n_components) + 1
+    centers = ddmc.transform(as_df=True)
 
     # parse inhibitor names from sample names
     inhibitors = [s.split(".")[1].split(".")[0] for s in abund.columns]
@@ -103,7 +102,6 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
     ax_a.set_title("PCA Scores")
     ax_a.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10)
     ax_a.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10)
-    ax_a.legend(prop={"size": 8})
 
     # plot loadings
     sns.scatterplot(

diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py
@@ -1,68 +1,119 @@
-"""
-This creates Figure 5: Tumor vs NAT analysis
-"""
-
 import numpy as np
-import pandas as pd
 import seaborn as sns
-import textwrap
-import mygene
 from scipy.stats import mannwhitneyu
 from sklearn.preprocessing import StandardScaler
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
 from statsmodels.stats.multitest import multipletests
-from .common import getSetup, getDDMC_CPTAC
-from ..logistic_regression import plotClusterCoefficients, plotROC
-from .common import plot_distance_to_upstream_kinase, TumorType
-from ..pca import plotPCA
-from ..pre_processing import filter_NaNpeptides
+
+from ddmc.clustering import DDMC
+from ddmc.datasets import CPTAC, select_peptide_subset
+from ddmc.figures.common import getSetup
+from ddmc.figures.common import plot_cluster_kinase_distances
+from ddmc.logistic_regression import plotClusterCoefficients, plotROC
 
 
 def makeFigure():
-    """Get a list of the axis objects and create a figure"""
-    # Get list of axis objects
-    ax, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1})
+    axes, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1})
+    cptac = CPTAC()
+    p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=0.1)
+    model = DDMC(n_components=30, seq_weight=100, max_iter=5).fit(p_signal)
 
-    # Fit DDMC
-    model, X = getDDMC_CPTAC(n_components=30, SeqWeight=100.0)
-
-    # Normalize
-    centers = pd.DataFrame(model.transform()).T
-    centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(
-        centers.iloc[:, :]
-    )
-    centers = centers.T
-    centers.columns = np.arange(model.n_components) + 1
-    centers["Patient_ID"] = X.columns[4:]
-    centers = TumorType(centers).set_index("Patient_ID")
-    centers = centers.drop([14, 24], axis=1)  # Drop cluster 19, contains only 1 peptide
+    centers = model.transform(as_df=True)
+    centers.loc[:, :] = StandardScaler(with_std=False).fit_transform(centers.values)
+    is_tumor = cptac.get_tumor_or_nat(centers.index)
 
     # first plot heatmap of clusters
     # lim = 1.5
     # sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim,  figsize=(15, 9)) Run in notebook and save as svg
-    ax[0].axis("off")
-
-    # PCA and Hypothesis Testing
-    pvals = calculate_mannW_pvals(centers, "Type", "NAT", "Tumor")
-    pvals = build_pval_matrix(model.n_components, pvals)
-    plotPCA(
-        ax[1:3],
-        centers.reset_index(),
-        2,
-        ["Patient_ID", "Type"],
-        "Cluster",
-        hue_scores="Type",
-        style_scores="Type",
-        pvals=pvals.iloc[:, -1].values,
+    axes[0].axis("off")
+
+    # get p value of tumor vs NAT for each cluster
+    pvals = []
+    centers_tumor = centers[is_tumor]
+    centers_nat = centers[~is_tumor]
+    for col in centers.columns:
+        pvals.append(mannwhitneyu(centers_tumor[col], centers_nat[col])[1])
+    pvals = multipletests(pvals)[1]
+
+    # run PCA on cluster centers
+    pca = PCA(n_components=2)
+    scores = pca.fit_transform(centers)  # sample by PCA component
+    loadings = pca.components_  # PCA component by cluster
+    variance_explained = np.round(pca.explained_variance_ratio_, 2)
+
+    # plot scores
+    sns.scatterplot(
+        x=scores[:, 0],
+        y=scores[:, 1],
+        hue=is_tumor,
+        ax=axes[1],
+        **{"linewidth": 0.5, "edgecolor": "k"},
+    )
+    axes[1].legend(loc="lower left", prop={"size": 9}, title="Tumor", fontsize=9)
+    axes[1].set_title("PCA Scores")
+    axes[1].set_xlabel(
+        "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
+    )
+    axes[1].set_ylabel(
+        "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
     )
-    plot_clusters_binaryfeatures(centers, "Type", ax[3], pvals=pvals, loc="lower left")
 
-    # Transform to Binary
-    c = centers.select_dtypes(include=["float64"])
-    tt = centers.iloc[:, -1]
-    tt = tt.replace("NAT", 0)
-    tt = tt.replace("Tumor", 1)
+    # plot loadings
+    sns.scatterplot(
+        x=loadings[0],
+        y=loadings[1],
+        ax=axes[2],
+        hue=pvals < 0.01,
+        **{"linewidth": 0.5, "edgecolor": "k"},
+    )
+    axes[2].set_title("PCA Loadings")
+    axes[2].set_xlabel(
+        "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
+    )
+    axes[2].set_ylabel(
+        "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
+    )
+    axes[2].legend(title="p < 0.01", prop={"size": 8})
+    for j, txt in enumerate(centers.columns):
+        axes[2].annotate(
+            txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
+        )
+
+    # plot tumor vs nat by cluster
+    df_violin = (
+        centers.assign(is_tumor=is_tumor)
+        .reset_index()
+        .melt(
+            id_vars="is_tumor",
+            value_vars=centers.columns,
+            value_name="p-signal",
+            var_name="Cluster",
+        )
+    )
+    sns.violinplot(
+        data=df_violin,
+        x="Cluster",
+        y="p-signal",
+        hue="is_tumor",
+        dodge=True,
+        ax=axes[3],
+        linewidth=0.25,
+    )
+    axes[3].legend(prop={"size": 8})
+
+    annotation_height = df_violin["p-signal"].max() + 0.02
+    for i, pval in enumerate(pvals):
+        if pval < 0.05:
+            annotation = "*"
+        elif pval < 0.01:
+            annotation = "**"
+        else:
+            continue
+        axes[3].text(
+            i, annotation_height, annotation, ha="center", va="bottom", fontsize=10
+        )
 
     # Logistic Regression
     lr = LogisticRegressionCV(
@@ -74,140 +125,16 @@ def makeFigure():
         l1_ratios=[0.85],
         class_weight="balanced",
     )
-    plotROC(ax[4], lr, c.values, tt, cv_folds=4, return_mAUC=False)
-    plotClusterCoefficients(ax[5], lr)
-    ax[5].set_xticklabels(centers.columns[:-1])
+    plotROC(lr, centers.values, is_tumor, cv_folds=4, return_mAUC=False, ax=axes[4])
 
-    # Upstream Kinases
-    plot_distance_to_upstream_kinase(model, [6, 15, 20], ax[6], num_hits=2)
+    plotClusterCoefficients(axes[5], lr)
 
-    return f
+    top_clusters = np.argsort(np.abs(lr.coef_.squeeze()))[-3:]
 
+    # plot predicted kinases for most weighted clusters
+    distances = model.predict_upstream_kinases()[top_clusters]
 
-def make_BPtoGenes_table(X, cluster):
-    d = X[["Clusters", "Description", "geneID"]]
-    d = d[d["Clusters"] == cluster]
-    gAr = d[["geneID"]].values
-    bpAr = d[["Description"]].values
-    mg = mygene.MyGeneInfo()
-    BPtoGenesDict = {}
-    for ii, arr in enumerate(gAr):
-        gg = mg.querymany(
-            list(arr[0].split("/")),
-            scopes="entrezgene",
-            fields="symbol",
-            species="human",
-            returnall=False,
-            as_dataframe=True,
-        )
-        BPtoGenesDict[bpAr[ii][0]] = list(gg["symbol"])
-    return pd.DataFrame(dict([(k, pd.Series(v)) for k, v in BPtoGenesDict.items()]))
-
-
-def plot_enriched_processes(ax, X, y, f, cluster, gene_set="WP"):
-    """"Plot BPs enriched per cluster""" ""
-    if gene_set == "WP":
-        gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[
-            :, 1:
-        ]
-    elif gene_set == "onco":
-        gsea = pd.read_csv(
-            "ddmc/data/cluster_analysis/CPTAC_GSEA_ONCO_results.csv"
-        ).iloc[:, 1:]
-    elif gene_set == "Immuno":
-        gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[
-            :, 1:
-        ]
-    cc = make_BPtoGenes_table(gsea, cluster)
-    cl = X[X["Cluster"] == cluster].set_index("Gene")
-    dfs = []
-    for ii in range(cc.shape[1]):
-        ss = cl.loc[cc.iloc[:, ii].dropna()].reset_index()
-        ss["Process"] = cc.columns[ii]
-        dfs.append(ss)
-
-    out = pd.concat(dfs).set_index("Process").select_dtypes(include=[float]).T
-    out[f[0]] = y
-    out[f[0]] = out[f[0]].replace(0, f[1])
-    out[f[0]] = out[f[0]].replace(1, f[2])
-    dm = pd.melt(
-        out,
-        id_vars=f[0],
-        value_vars=out.columns,
-        var_name="Process",
-        value_name="mean log(p-signal)",
-    )
-    dm.iloc[:, -1] = dm.iloc[:, -1].astype(float)
-    sns.boxplot(
-        data=dm,
-        x="Process",
-        y="mean log(p-signal)",
-        hue=f[0],
-        showfliers=False,
-        linewidth=0.5,
-        ax=ax,
-    )
-    ax.set_xticklabels([textwrap.fill(t, 10) for t in list(cc.columns)], rotation=0)
-    ax.set_title("Processes Cluster " + str(cluster))
-
-
-def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"):
-    """Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster"""
-    data = pd.melt(
-        id_vars=id_var,
-        value_vars=centers.columns[:-1],
-        value_name="p-signal",
-        var_name="Cluster",
-        frame=centers,
+    plot_cluster_kinase_distances(
+        distances, model.get_pssms(clusters=top_clusters), axes[6], num_hits=2
     )
-    sns.violinplot(
-        x="Cluster",
-        y="p-signal",
-        hue=id_var,
-        data=data,
-        dodge=True,
-        ax=ax,
-        linewidth=0.25,
-    )
-    ax.legend(prop={"size": 8}, loc=loc)
-
-    if not isinstance(pvals, bool):
-        for ii, s in enumerate(pvals["Significant"]):
-            y, h, col = data["p-signal"].max(), 0.05, "k"
-            if s == "NS":
-                continue
-            elif s == "<0.05":
-                mark = "*"
-            else:
-                mark = "**"
-            ax.text(ii, y + h, mark, ha="center", va="bottom", color=col, fontsize=20)
-
-
-def calculate_mannW_pvals(centers, col, feature1, feature2):
-    """Compute Mann Whitney rank test p-values corrected for multiple tests."""
-    pvals, clus = [], []
-    for ii in centers.columns[:-1]:
-        x = centers.loc[:, [ii, col]]
-        x1 = x[x[col] == feature1].iloc[:, 0]
-        x2 = x[x[col] == feature2].iloc[:, 0]
-        pval = mannwhitneyu(x1, x2)[1]
-        pvals.append(pval)
-        clus.append(ii)
-    return dict(zip(clus, multipletests(pvals)[1]))
-
-
-def build_pval_matrix(ncl, pvals):
-    """Build data frame with pvalues per cluster"""
-    data = pd.DataFrame()
-    data["Clusters"] = pvals.keys()
-    data["p-value"] = pvals.values()
-    signif = []
-    for val in pvals.values():
-        if 0.01 < val < 0.05:
-            signif.append("<0.05")
-        elif val < 0.01:
-            signif.append("<0.01")
-        else:
-            signif.append("NS")
-    data["Significant"] = signif
-    return data
+    return f
diff --git a/ddmc/logistic_regression.py b/ddmc/logistic_regression.py
@@ -19,7 +19,7 @@ def plotClusterCoefficients(ax: Axes, lr, hue=None, xlabels=False, title=False):
         coefs_["Sample"] = [l.split("_")[1] for l in hue]
         hue = "Sample"
     else:
-        coefs_["Cluster"] = np.arange(coefs_.shape[0]) + 1
+        coefs_["Cluster"] = np.arange(coefs_.shape[0])
     if xlabels:
         coefs_["Cluster"] = xlabels
     p = sns.barplot(