diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py
index d2fed372..c42eae44 100644
--- a/ddmc/figures/common.py
+++ b/ddmc/figures/common.py
@@ -4,7 +4,6 @@
 import sys
 import time
-from pathlib import Path
 from string import ascii_uppercase
 
 from matplotlib import gridspec, pyplot as plt, axes, rcParams
 import seaborn as sns
@@ -12,13 +11,11 @@
 import pandas as pd
 import svgutils.transform as st
 import logomaker as lm
-from sklearn.preprocessing import StandardScaler
-from ..pre_processing import filter_NaNpeptides
-from ..clustering import DDMC
 from scipy.stats import mannwhitneyu
 from statsmodels.stats.multitest import multipletests
 from ..motifs import KinToPhosphotypeDict
 from ddmc.binomial import AAlist
+from sklearn.decomposition import PCA
 
 
 rcParams["font.sans-serif"] = "Arial"
@@ -249,6 +246,63 @@ def plot_p_signal_across_clusters_and_binary_feature(
         ax.text(i, annotation_height, annotation, ha="center", va="bottom", fontsize=10)
 
 
+def plot_pca_on_cluster_centers(
+    centers: pd.DataFrame,
+    axes,
+    hue_scores: np.ndarray = None,
+    hue_scores_title: str = None,
+    hue_loadings: np.ndarray = None,
+    hue_loadings_title: str = None,
+):
+    # run PCA on cluster centers
+    pca = PCA(n_components=2)
+    scores = pca.fit_transform(centers)  # sample by PCA component
+    loadings = pca.components_  # PCA component by cluster
+    variance_explained = np.round(pca.explained_variance_ratio_, 2)
+
+    # plot scores
+    sns.scatterplot(
+        x=scores[:, 0],
+        y=scores[:, 1],
+        hue=hue_scores,
+        ax=axes[0],
+        **{"linewidth": 0.5, "edgecolor": "k"},
+    )
+    if hue_scores_title:
+        axes[0].legend(
+            loc="lower left", prop={"size": 9}, title=hue_scores_title, fontsize=9
+        )
+    axes[0].set_title("PCA Scores")
+    axes[0].set_xlabel(
+        "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
+    )
+    axes[0].set_ylabel(
+        "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
+    )
+
+    # plot loadings
+    sns.scatterplot(
+        x=loadings[0],
+        y=loadings[1],
+        ax=axes[1],
+        hue=hue_loadings,
+        **{"linewidth": 0.5, "edgecolor": "k"},
+    )
+    if hue_loadings_title:
+        axes[1].legend(title=hue_loadings_title, prop={"size": 8})
+    axes[1].set_title("PCA Loadings")
+    axes[1].set_xlabel(
+        "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
+    )
+    axes[1].set_ylabel(
+        "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
+    )
+    for j, txt in enumerate(centers.columns):
+        axes[1].annotate(
+            txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
+        )
+
+
 def ExportClusterFile(cluster, cptac=False, mcf7=False):
     """Export cluster SVG file for NetPhorest and GO analysis."""
     if cptac:
diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py
index 67d15e2e..c0d1b592 100644
--- a/ddmc/figures/figureM3.py
+++ b/ddmc/figures/figureM3.py
@@ -9,17 +9,15 @@
 from ..clustering import DDMC
 from ..binomial import AAlist
 from .common import getSetup
-from ..pca import plotPCA
 from .common import (
-    plot_distance_to_upstream_kinase,
     plot_motifs,
     plot_cluster_kinase_distances,
+    plot_pca_on_cluster_centers,
 )
 from ..clustering import compute_control_pssm, get_pspl_pssm_distances
 from ..binomial import AAlist
 from ..motifs import DictProteomeNameToSeq, get_pspls
 from ..pre_processing import filter_NaNpeptides, separate_sequence_and_abundance
-from sklearn.decomposition import PCA
 
 
 def makeFigure():
@@ -84,37 +82,9 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
         for drug in inhibitors
     ]
 
-    # run PCA on cluster centers
-    pca = PCA(n_components=2)
-    scores = pca.fit_transform(centers)  # sample by PCA component
-    loadings = pca.components_  # PCA component by cluster
-    variance_explained = np.round(pca.explained_variance_ratio_, 2)
-
-    # plot scores
-    sns.scatterplot(
-        x=scores[:, 0],
-        y=scores[:, 1],
-        hue=is_AKTi,
-        ax=ax_a,
-        **{"linewidth": 0.5, "edgecolor": "k"},
+    plot_pca_on_cluster_centers(
+        centers, [ax_a, ax_b], hue_scores=is_AKTi, hue_scores_title="AKTi?"
     )
-    ax_a.legend(loc="lower left", prop={"size": 9}, title="AKTi", fontsize=9)
-    ax_a.set_title("PCA Scores")
-    ax_a.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10)
-    ax_a.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10)
-
-    # plot loadings
-    sns.scatterplot(
-        x=loadings[0], y=loadings[1], ax=ax_b, **{"linewidth": 0.5, "edgecolor": "k"}
-    )
-    ax_b.set_title("PCA Loadings")
-    ax_b.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10)
-    ax_b.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10)
-    ax_b.legend(prop={"size": 8})
-    for j, txt in enumerate(centers.columns):
-        ax_b.annotate(
-            txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
-        )
 
     # Plot kinase predictions for cluster 16
     plot_cluster_kinase_distances(
diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py
index 28c6855b..d7ba4cb4 100644
--- a/ddmc/figures/figureM5.py
+++ b/ddmc/figures/figureM5.py
@@ -1,11 +1,9 @@
 import numpy as np
 import seaborn as sns
-from scipy.stats import mannwhitneyu
 from sklearn.preprocessing import StandardScaler
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.preprocessing import StandardScaler
 from sklearn.decomposition import PCA
-from statsmodels.stats.multitest import multipletests
 
 from ddmc.clustering import DDMC
 from ddmc.datasets import CPTAC, select_peptide_subset
@@ -14,6 +12,7 @@ plot_cluster_kinase_distances,
     get_pvals_across_clusters,
     plot_p_signal_across_clusters_and_binary_feature,
+    plot_pca_on_cluster_centers,
 )
 from ddmc.logistic_regression import plotClusterCoefficients, plotROC
 
 def makeFigure():
@@ -33,51 +32,18 @@
     # sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim, figsize=(15, 9)) Run in notebook and save as svg
     axes[0].axis("off")
 
-    # run PCA on cluster centers
-    pca = PCA(n_components=2)
-    scores = pca.fit_transform(centers)  # sample by PCA component
-    loadings = pca.components_  # PCA component by cluster
-    variance_explained = np.round(pca.explained_variance_ratio_, 2)
-
-    # plot scores
-    sns.scatterplot(
-        x=scores[:, 0],
-        y=scores[:, 1],
-        hue=is_tumor,
-        ax=axes[1],
-        **{"linewidth": 0.5, "edgecolor": "k"},
-    )
-    axes[1].legend(loc="lower left", prop={"size": 9}, title="Tumor", fontsize=9)
-    axes[1].set_title("PCA Scores")
-    axes[1].set_xlabel(
-        "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
-    )
-    axes[1].set_ylabel(
-        "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
+    plot_pca_on_cluster_centers(
+        centers,
+        axes[1:3],
+        hue_scores=is_tumor,
+        hue_scores_title="Tumor?",
+        hue_loadings=get_pvals_across_clusters(is_tumor, centers) < 0.01,
+        hue_loadings_title="p < 0.01",
     )
 
-    # plot loadings
-    sns.scatterplot(
-        x=loadings[0],
-        y=loadings[1],
-        ax=axes[2],
-        hue=get_pvals_across_clusters(is_tumor, centers) < 0.01,
-        **{"linewidth": 0.5, "edgecolor": "k"},
+    plot_p_signal_across_clusters_and_binary_feature(
+        is_tumor, centers, "is_tumor", axes[4]
     )
-    axes[2].set_title("PCA Loadings")
-    axes[2].set_xlabel(
-        "PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
-    )
-    axes[2].set_ylabel(
-        "PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
-    )
-    axes[2].legend(title="p < 0.01", prop={"size": 8})
-    for j, txt in enumerate(centers.columns):
-        axes[2].annotate(
-            txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
-        )
-
-    plot_p_signal_across_clusters_and_binary_feature(is_tumor, centers, "is_tumor", axes[4])
 
     # Logistic Regression
     lr = LogisticRegressionCV(
diff --git a/ddmc/pca.py b/ddmc/pca.py
deleted file mode 100644
index 9a72bcde..00000000
--- a/ddmc/pca.py
+++ /dev/null
@@ -1,104 +0,0 @@
-""" PCA functions """
-from typing import List
-
-import pandas as pd
-import numpy as np
-import seaborn as sns
-from sklearn.decomposition import PCA
-
-
-def pca_dfs(scores, loadings, df, n_components, sIDX, lIDX):
-    """build PCA scores and loadings data frames."""
-    dScor = pd.DataFrame()
-    dLoad = pd.DataFrame()
-    for i in range(n_components):
-        cpca = "PC" + str(i + 1)
-        dScor[cpca] = scores[:, i]
-        dLoad[cpca] = loadings[i, :]
-
-    for j in sIDX:
-        dScor[j] = list(df[j])
-    # populate the "Cluster" col with the names of the clusters from df
-    dLoad[lIDX] = df.select_dtypes(include=["float64"]).columns
-    return dScor, dLoad
-
-
-def plotPCA(
-    axes: List,
-    cluster_centers: pd.DataFrame,
-    n_components: int,
-    scores_ind,
-    loadings_ind,
-    hue_scores=None,
-    style_scores=None,
-    pvals=None,
-    style_load=None,
-    legendOut=False,
-    quadrants=True,
-):
-    """Plot PCA scores and loadings."""
-    pca = PCA(n_components=n_components)
-    dScor_ = pca.fit_transform(cluster_centers.select_dtypes(include=["float64"]))
-    dLoad_ = pca.components_
-    dScor_, dLoad_ = pca_dfs(dScor_, dLoad_, cluster_centers, n_components, scores_ind, loadings_ind)
-    varExp = np.round(pca.explained_variance_ratio_, 2)
-
-    # Scores
-    sns.scatterplot(
-        x="PC1",
-        y="PC2",
-        data=dScor_,
-        hue=hue_scores,
-        style=style_scores,
-        ax=axes[0],
-        **{"linewidth": 0.5, "edgecolor": "k"}
-    )
-    axes[0].set_title("PCA Scores")
-    axes[0].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10)
-    axes[0].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10)
-    axes[0].legend(prop={"size": 8})
-    if legendOut:
-        axes[0].legend(
-            bbox_to_anchor=(1.05, 1),
-            loc=2,
-            borderaxespad=0,
-            labelspacing=0.2,
-            prop={"size": 8},
-        )
-
-    # Loadings
-    if isinstance(pvals, np.ndarray):
-        dLoad_["p-value"] = pvals
-        sns.scatterplot(
-            x="PC1",
-            y="PC2",
-            data=dLoad_,
-            hue="p-value",
-            style=style_load,
-            ax=axes[1],
-            **{"linewidth": 0.5, "edgecolor": "k"}
-        )
-    else:
-        sns.scatterplot(
-            x="PC1",
-            y="PC2",
-            data=dLoad_,
-            style=style_load,
-            ax=axes[1],
-            **{"linewidth": 0.5, "edgecolor": "k"}
-        )
-
-    axes[1].set_title("PCA Loadings")
-    axes[1].set_xlabel("PC1 (" + str(int(varExp[0] * 100)) + "%)", fontsize=10)
-    axes[1].set_ylabel("PC2 (" + str(int(varExp[1] * 100)) + "%)", fontsize=10)
-    axes[1].legend(prop={"size": 8})
-    for j, txt in enumerate(dLoad_[loadings_ind]):
-        axes[1].annotate(
-            txt, (dLoad_["PC1"][j] + 0.001, dLoad_["PC2"][j] + 0.001), fontsize=10
-        )
-
-    if quadrants:
-        axes[0].axhline(0, ls="--", color="lightgrey")
-        axes[0].axvline(0, ls="--", color="lightgrey")
-        axes[1].axhline(0, ls="--", color="lightgrey")
-        axes[1].axvline(0, ls="--", color="lightgrey")
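
For reviewers who want to exercise the new shared helper in isolation, here is a minimal usage sketch (not part of the patch; the synthetic centers table, the is_treated labels, and the output filename are made up purely for illustration, standing in for real DDMC cluster centers and a real binary condition):

# Illustrative only -- synthetic stand-in for real DDMC output.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ddmc.figures.common import plot_pca_on_cluster_centers

rng = np.random.default_rng(0)
centers = pd.DataFrame(
    rng.normal(size=(20, 5)),               # 20 samples x 5 cluster centers
    columns=[str(i) for i in range(1, 6)],  # cluster labels used to annotate loadings
)
is_treated = np.repeat([False, True], 10)   # hypothetical binary condition per sample

fig, axs = plt.subplots(1, 2, figsize=(8, 4))
plot_pca_on_cluster_centers(
    centers,
    axs,                                    # axs[0] receives scores, axs[1] receives loadings
    hue_scores=is_treated,
    hue_scores_title="Treated?",
)
fig.savefig("pca_centers.svg")

Note that the helper re-fits a two-component PCA on whatever centers table it is given, so figureM3 and figureM5 now share a single code path for the score and loading panels.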