Skip to content

Commit

Permalink
refactor pca functions
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 2, 2024
1 parent d6c7fe0 commit 3ef804f
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 185 deletions.
62 changes: 58 additions & 4 deletions ddmc/figures/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,18 @@

import sys
import time
from pathlib import Path
from string import ascii_uppercase
from matplotlib import gridspec, pyplot as plt, axes, rcParams
import seaborn as sns
import numpy as np
import pandas as pd
import svgutils.transform as st
import logomaker as lm
from sklearn.preprocessing import StandardScaler
from ..pre_processing import filter_NaNpeptides
from ..clustering import DDMC
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
from ..motifs import KinToPhosphotypeDict
from ddmc.binomial import AAlist
from sklearn.decomposition import PCA


rcParams["font.sans-serif"] = "Arial"
Expand Down Expand Up @@ -249,6 +246,63 @@ def plot_p_signal_across_clusters_and_binary_feature(
ax.text(i, annotation_height, annotation, ha="center", va="bottom", fontsize=10)


def plot_pca_on_cluster_centers(
centers: pd.DataFrame,
axes,
hue_scores: np.ndarray = None,
hue_scores_title: str = None,
hue_loadings: np.ndarray = None,
hue_loadings_title: str = None,
):
# run PCA on cluster centers
pca = PCA(n_components=2)
scores = pca.fit_transform(centers) # sample by PCA component
loadings = pca.components_ # PCA component by cluster
variance_explained = np.round(pca.explained_variance_ratio_, 2)

# plot scores
sns.scatterplot(
x=scores[:, 0],
y=scores[:, 1],
hue=hue_scores,
ax=axes[0],
**{"linewidth": 0.5, "edgecolor": "k"},
)
if hue_scores_title:
axes[0].legend(
loc="lower left", prop={"size": 9}, title=hue_scores_title, fontsize=9
)
axes[0].set_title("PCA Scores")
axes[0].set_xlabel(
"PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
)
axes[0].set_ylabel(
"PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
)

# plot loadings
sns.scatterplot(
x=loadings[0],
y=loadings[1],
ax=axes[1],
hue=hue_loadings,
**{"linewidth": 0.5, "edgecolor": "k"},
)
if hue_loadings_title:
axes[1].legend(title="p < 0.01", prop={"size": 8})
axes[1].set_title("PCA Loadings")
axes[1].set_xlabel(
"PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
)
axes[1].set_ylabel(
"PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
)
for j, txt in enumerate(centers.columns):
axes[1].annotate(
txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
)


def ExportClusterFile(cluster, cptac=False, mcf7=False):
"""Export cluster SVG file for NetPhorest and GO analysis."""
if cptac:
Expand Down
36 changes: 3 additions & 33 deletions ddmc/figures/figureM3.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,15 @@
from ..clustering import DDMC
from ..binomial import AAlist
from .common import getSetup
from ..pca import plotPCA
from .common import (
plot_distance_to_upstream_kinase,
plot_motifs,
plot_cluster_kinase_distances,
plot_pca_on_cluster_centers,
)
from ..clustering import compute_control_pssm, get_pspl_pssm_distances
from ..binomial import AAlist
from ..motifs import DictProteomeNameToSeq, get_pspls
from ..pre_processing import filter_NaNpeptides, separate_sequence_and_abundance
from sklearn.decomposition import PCA


def makeFigure():
Expand Down Expand Up @@ -84,37 +82,9 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
for drug in inhibitors
]

# run PCA on cluster centers
pca = PCA(n_components=2)
scores = pca.fit_transform(centers) # sample by PCA component
loadings = pca.components_ # PCA component by cluster
variance_explained = np.round(pca.explained_variance_ratio_, 2)

# plot scores
sns.scatterplot(
x=scores[:, 0],
y=scores[:, 1],
hue=is_AKTi,
ax=ax_a,
**{"linewidth": 0.5, "edgecolor": "k"},
plot_pca_on_cluster_centers(
centers, [ax_a, ax_b], hue_scores=is_AKTi, hue_scores_title="AKTi?"
)
ax_a.legend(loc="lower left", prop={"size": 9}, title="AKTi", fontsize=9)
ax_a.set_title("PCA Scores")
ax_a.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10)
ax_a.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10)

# plot loadings
sns.scatterplot(
x=loadings[0], y=loadings[1], ax=ax_b, **{"linewidth": 0.5, "edgecolor": "k"}
)
ax_b.set_title("PCA Loadings")
ax_b.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10)
ax_b.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10)
ax_b.legend(prop={"size": 8})
for j, txt in enumerate(centers.columns):
ax_b.annotate(
txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
)

# Plot kinase predictions for cluster 16
plot_cluster_kinase_distances(
Expand Down
54 changes: 10 additions & 44 deletions ddmc/figures/figureM5.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import numpy as np
import seaborn as sns
from scipy.stats import mannwhitneyu
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.stats.multitest import multipletests

from ddmc.clustering import DDMC
from ddmc.datasets import CPTAC, select_peptide_subset
Expand All @@ -14,6 +12,7 @@
plot_cluster_kinase_distances,
get_pvals_across_clusters,
plot_p_signal_across_clusters_and_binary_feature,
plot_pca_on_cluster_centers,
)
from ddmc.logistic_regression import plotClusterCoefficients, plotROC

Expand All @@ -33,51 +32,18 @@ def makeFigure():
# sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim, figsize=(15, 9)) Run in notebook and save as svg
axes[0].axis("off")

# run PCA on cluster centers
pca = PCA(n_components=2)
scores = pca.fit_transform(centers) # sample by PCA component
loadings = pca.components_ # PCA component by cluster
variance_explained = np.round(pca.explained_variance_ratio_, 2)

# plot scores
sns.scatterplot(
x=scores[:, 0],
y=scores[:, 1],
hue=is_tumor,
ax=axes[1],
**{"linewidth": 0.5, "edgecolor": "k"},
)
axes[1].legend(loc="lower left", prop={"size": 9}, title="Tumor", fontsize=9)
axes[1].set_title("PCA Scores")
axes[1].set_xlabel(
"PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
)
axes[1].set_ylabel(
"PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
plot_pca_on_cluster_centers(
centers,
axes[1:3],
hue_scores=is_tumor,
hue_scores_title="Tumor?",
hue_loadings=get_pvals_across_clusters(is_tumor, centers) < 0.01,
hue_loadings_title="p < 0.01",
)

# plot loadings
sns.scatterplot(
x=loadings[0],
y=loadings[1],
ax=axes[2],
hue=get_pvals_across_clusters(is_tumor, centers) < 0.01,
**{"linewidth": 0.5, "edgecolor": "k"},
plot_p_signal_across_clusters_and_binary_feature(
is_tumor, centers, "is_tumor", axes[4]
)
axes[2].set_title("PCA Loadings")
axes[2].set_xlabel(
"PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
)
axes[2].set_ylabel(
"PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
)
axes[2].legend(title="p < 0.01", prop={"size": 8})
for j, txt in enumerate(centers.columns):
axes[2].annotate(
txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
)

plot_p_signal_across_clusters_and_binary_feature(is_tumor, centers, "is_tumor", axes[4])

# Logistic Regression
lr = LogisticRegressionCV(
Expand Down
104 changes: 0 additions & 104 deletions ddmc/pca.py

This file was deleted.

0 comments on commit 3ef804f

Please sign in to comment.