diff --git a/ddmc/clustering.py b/ddmc/clustering.py index aed76630..9a4c7794 100644 --- a/ddmc/clustering.py +++ b/ddmc/clustering.py @@ -23,7 +23,7 @@ def __init__( self, n_components: int, seq_weight: float, - distance_method: Literal["PAM250", "Binomial"], + distance_method: Literal["PAM250", "Binomial"] = "Binomial", random_state=None, max_iter=200, tol=1e-4, diff --git a/ddmc/datasets.py b/ddmc/datasets.py index 1f5da3d5..14c674df 100644 --- a/ddmc/datasets.py +++ b/ddmc/datasets.py @@ -91,3 +91,6 @@ def get_hot_cold_labels(self): hot_cold = hot_cold.replace("Hot-tumor enriched", 1) hot_cold = hot_cold.dropna() return np.squeeze(hot_cold) + + def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]: + return ~np.array([sample.endswith(".N") for sample in samples]) diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py index d41f9288..67d15e2e 100644 --- a/ddmc/figures/figureM3.py +++ b/ddmc/figures/figureM3.py @@ -57,8 +57,7 @@ def plot_fig_3abd(ax_a, ax_b, ax_d): ).fit(abund) # get cluster centers - centers = pd.DataFrame(ddmc.transform()) - centers.columns = np.arange(ddmc.n_components) + 1 + centers = ddmc.transform(as_df=True) # parse inhibitor names from sample names inhibitors = [s.split(".")[1].split(".")[0] for s in abund.columns] @@ -103,7 +102,6 @@ def plot_fig_3abd(ax_a, ax_b, ax_d): ax_a.set_title("PCA Scores") ax_a.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10) ax_a.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10) - ax_a.legend(prop={"size": 8}) # plot loadings sns.scatterplot( diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py index b5e85f06..ff6d5a96 100644 --- a/ddmc/figures/figureM5.py +++ b/ddmc/figures/figureM5.py @@ -1,68 +1,119 @@ -""" -This creates Figure 5: Tumor vs NAT analysis -""" - import numpy as np -import pandas as pd import seaborn as sns -import textwrap -import mygene from scipy.stats import mannwhitneyu from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegressionCV from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA from statsmodels.stats.multitest import multipletests -from .common import getSetup, getDDMC_CPTAC -from ..logistic_regression import plotClusterCoefficients, plotROC -from .common import plot_distance_to_upstream_kinase, TumorType -from ..pca import plotPCA -from ..pre_processing import filter_NaNpeptides + +from ddmc.clustering import DDMC +from ddmc.datasets import CPTAC, select_peptide_subset +from ddmc.figures.common import getSetup +from ddmc.figures.common import plot_cluster_kinase_distances +from ddmc.logistic_regression import plotClusterCoefficients, plotROC def makeFigure(): - """Get a list of the axis objects and create a figure""" - # Get list of axis objects - ax, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1}) + axes, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1}) + cptac = CPTAC() + p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=0.1) + model = DDMC(n_components=30, seq_weight=100, max_iter=5).fit(p_signal) - # Fit DDMC - model, X = getDDMC_CPTAC(n_components=30, SeqWeight=100.0) - - # Normalize - centers = pd.DataFrame(model.transform()).T - centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform( - centers.iloc[:, :] - ) - centers = centers.T - centers.columns = np.arange(model.n_components) + 1 - centers["Patient_ID"] = X.columns[4:] - centers = TumorType(centers).set_index("Patient_ID") - centers = centers.drop([14, 24], axis=1) # Drop cluster 19, contains only 1 peptide + centers = model.transform(as_df=True) + centers.loc[:, :] = StandardScaler(with_std=False).fit_transform(centers.values) + is_tumor = cptac.get_tumor_or_nat(centers.index) # first plot heatmap of clusters # lim = 1.5 # sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim, figsize=(15, 9)) Run in notebook and save as svg - ax[0].axis("off") - - # PCA and Hypothesis Testing - pvals = calculate_mannW_pvals(centers, "Type", "NAT", "Tumor") - pvals = build_pval_matrix(model.n_components, pvals) - plotPCA( - ax[1:3], - centers.reset_index(), - 2, - ["Patient_ID", "Type"], - "Cluster", - hue_scores="Type", - style_scores="Type", - pvals=pvals.iloc[:, -1].values, + axes[0].axis("off") + + # get p value of tumor vs NAT for each cluster + pvals = [] + centers_tumor = centers[is_tumor] + centers_nat = centers[~is_tumor] + for col in centers.columns: + pvals.append(mannwhitneyu(centers_tumor[col], centers_nat[col])[1]) + pvals = multipletests(pvals)[1] + + # run PCA on cluster centers + pca = PCA(n_components=2) + scores = pca.fit_transform(centers) # sample by PCA component + loadings = pca.components_ # PCA component by cluster + variance_explained = np.round(pca.explained_variance_ratio_, 2) + + # plot scores + sns.scatterplot( + x=scores[:, 0], + y=scores[:, 1], + hue=is_tumor, + ax=axes[1], + **{"linewidth": 0.5, "edgecolor": "k"}, + ) + axes[1].legend(loc="lower left", prop={"size": 9}, title="Tumor", fontsize=9) + axes[1].set_title("PCA Scores") + axes[1].set_xlabel( + "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10 + ) + axes[1].set_ylabel( + "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10 ) - plot_clusters_binaryfeatures(centers, "Type", ax[3], pvals=pvals, loc="lower left") - # Transform to Binary - c = centers.select_dtypes(include=["float64"]) - tt = centers.iloc[:, -1] - tt = tt.replace("NAT", 0) - tt = tt.replace("Tumor", 1) + # plot loadings + sns.scatterplot( + x=loadings[0], + y=loadings[1], + ax=axes[2], + hue=pvals < 0.01, + **{"linewidth": 0.5, "edgecolor": "k"}, + ) + axes[2].set_title("PCA Loadings") + axes[2].set_xlabel( + "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10 + ) + axes[2].set_ylabel( + "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10 + ) + axes[2].legend(title="p < 0.01", prop={"size": 8}) + for j, txt in enumerate(centers.columns): + axes[2].annotate( + txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10 + ) + + # plot tumor vs nat by cluster + df_violin = ( + centers.assign(is_tumor=is_tumor) + .reset_index() + .melt( + id_vars="is_tumor", + value_vars=centers.columns, + value_name="p-signal", + var_name="Cluster", + ) + ) + sns.violinplot( + data=df_violin, + x="Cluster", + y="p-signal", + hue="is_tumor", + dodge=True, + ax=axes[3], + linewidth=0.25, + ) + axes[3].legend(prop={"size": 8}) + + annotation_height = df_violin["p-signal"].max() + 0.02 + for i, pval in enumerate(pvals): + if pval < 0.05: + annotation = "*" + elif pval < 0.01: + annotation = "**" + else: + continue + axes[3].text( + i, annotation_height, annotation, ha="center", va="bottom", fontsize=10 + ) # Logistic Regression lr = LogisticRegressionCV( @@ -74,140 +125,16 @@ def makeFigure(): l1_ratios=[0.85], class_weight="balanced", ) - plotROC(ax[4], lr, c.values, tt, cv_folds=4, return_mAUC=False) - plotClusterCoefficients(ax[5], lr) - ax[5].set_xticklabels(centers.columns[:-1]) + plotROC(lr, centers.values, is_tumor, cv_folds=4, return_mAUC=False, ax=axes[4]) - # Upstream Kinases - plot_distance_to_upstream_kinase(model, [6, 15, 20], ax[6], num_hits=2) + plotClusterCoefficients(axes[5], lr) - return f + top_clusters = np.argsort(np.abs(lr.coef_.squeeze()))[-3:] + # plot predicted kinases for most weighted clusters + distances = model.predict_upstream_kinases()[top_clusters] -def make_BPtoGenes_table(X, cluster): - d = X[["Clusters", "Description", "geneID"]] - d = d[d["Clusters"] == cluster] - gAr = d[["geneID"]].values - bpAr = d[["Description"]].values - mg = mygene.MyGeneInfo() - BPtoGenesDict = {} - for ii, arr in enumerate(gAr): - gg = mg.querymany( - list(arr[0].split("/")), - scopes="entrezgene", - fields="symbol", - species="human", - returnall=False, - as_dataframe=True, - ) - BPtoGenesDict[bpAr[ii][0]] = list(gg["symbol"]) - return pd.DataFrame(dict([(k, pd.Series(v)) for k, v in BPtoGenesDict.items()])) - - -def plot_enriched_processes(ax, X, y, f, cluster, gene_set="WP"): - """"Plot BPs enriched per cluster""" "" - if gene_set == "WP": - gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[ - :, 1: - ] - elif gene_set == "onco": - gsea = pd.read_csv( - "ddmc/data/cluster_analysis/CPTAC_GSEA_ONCO_results.csv" - ).iloc[:, 1:] - elif gene_set == "Immuno": - gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[ - :, 1: - ] - cc = make_BPtoGenes_table(gsea, cluster) - cl = X[X["Cluster"] == cluster].set_index("Gene") - dfs = [] - for ii in range(cc.shape[1]): - ss = cl.loc[cc.iloc[:, ii].dropna()].reset_index() - ss["Process"] = cc.columns[ii] - dfs.append(ss) - - out = pd.concat(dfs).set_index("Process").select_dtypes(include=[float]).T - out[f[0]] = y - out[f[0]] = out[f[0]].replace(0, f[1]) - out[f[0]] = out[f[0]].replace(1, f[2]) - dm = pd.melt( - out, - id_vars=f[0], - value_vars=out.columns, - var_name="Process", - value_name="mean log(p-signal)", - ) - dm.iloc[:, -1] = dm.iloc[:, -1].astype(float) - sns.boxplot( - data=dm, - x="Process", - y="mean log(p-signal)", - hue=f[0], - showfliers=False, - linewidth=0.5, - ax=ax, - ) - ax.set_xticklabels([textwrap.fill(t, 10) for t in list(cc.columns)], rotation=0) - ax.set_title("Processes Cluster " + str(cluster)) - - -def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"): - """Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster""" - data = pd.melt( - id_vars=id_var, - value_vars=centers.columns[:-1], - value_name="p-signal", - var_name="Cluster", - frame=centers, + plot_cluster_kinase_distances( + distances, model.get_pssms(clusters=top_clusters), axes[6], num_hits=2 ) - sns.violinplot( - x="Cluster", - y="p-signal", - hue=id_var, - data=data, - dodge=True, - ax=ax, - linewidth=0.25, - ) - ax.legend(prop={"size": 8}, loc=loc) - - if not isinstance(pvals, bool): - for ii, s in enumerate(pvals["Significant"]): - y, h, col = data["p-signal"].max(), 0.05, "k" - if s == "NS": - continue - elif s == "<0.05": - mark = "*" - else: - mark = "**" - ax.text(ii, y + h, mark, ha="center", va="bottom", color=col, fontsize=20) - - -def calculate_mannW_pvals(centers, col, feature1, feature2): - """Compute Mann Whitney rank test p-values corrected for multiple tests.""" - pvals, clus = [], [] - for ii in centers.columns[:-1]: - x = centers.loc[:, [ii, col]] - x1 = x[x[col] == feature1].iloc[:, 0] - x2 = x[x[col] == feature2].iloc[:, 0] - pval = mannwhitneyu(x1, x2)[1] - pvals.append(pval) - clus.append(ii) - return dict(zip(clus, multipletests(pvals)[1])) - - -def build_pval_matrix(ncl, pvals): - """Build data frame with pvalues per cluster""" - data = pd.DataFrame() - data["Clusters"] = pvals.keys() - data["p-value"] = pvals.values() - signif = [] - for val in pvals.values(): - if 0.01 < val < 0.05: - signif.append("<0.05") - elif val < 0.01: - signif.append("<0.01") - else: - signif.append("NS") - data["Significant"] = signif - return data + return f diff --git a/ddmc/logistic_regression.py b/ddmc/logistic_regression.py index 4caa96cc..876c1c90 100644 --- a/ddmc/logistic_regression.py +++ b/ddmc/logistic_regression.py @@ -19,7 +19,7 @@ def plotClusterCoefficients(ax: Axes, lr, hue=None, xlabels=False, title=False): coefs_["Sample"] = [l.split("_")[1] for l in hue] hue = "Sample" else: - coefs_["Cluster"] = np.arange(coefs_.shape[0]) + 1 + coefs_["Cluster"] = np.arange(coefs_.shape[0]) if xlabels: coefs_["Cluster"] = xlabels p = sns.barplot(