Skip to content

Commit

Permalink
remake figure 5
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 1, 2024
1 parent 2f59463 commit b190377
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 185 deletions.
2 changes: 1 addition & 1 deletion ddmc/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(
self,
n_components: int,
seq_weight: float,
distance_method: Literal["PAM250", "Binomial"],
distance_method: Literal["PAM250", "Binomial"] = "Binomial",
random_state=None,
max_iter=200,
tol=1e-4,
Expand Down
3 changes: 3 additions & 0 deletions ddmc/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,6 @@ def get_hot_cold_labels(self):
hot_cold = hot_cold.replace("Hot-tumor enriched", 1)
hot_cold = hot_cold.dropna()
return np.squeeze(hot_cold)

def get_tumor_or_nat(self, samples: Sequence[str]) -> np.ndarray[bool]:
return ~np.array([sample.endswith(".N") for sample in samples])
4 changes: 1 addition & 3 deletions ddmc/figures/figureM3.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
).fit(abund)

# get cluster centers
centers = pd.DataFrame(ddmc.transform())
centers.columns = np.arange(ddmc.n_components) + 1
centers = ddmc.transform(as_df=True)

# parse inhibitor names from sample names
inhibitors = [s.split(".")[1].split(".")[0] for s in abund.columns]
Expand Down Expand Up @@ -103,7 +102,6 @@ def plot_fig_3abd(ax_a, ax_b, ax_d):
ax_a.set_title("PCA Scores")
ax_a.set_xlabel("PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10)
ax_a.set_ylabel("PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10)
ax_a.legend(prop={"size": 8})

# plot loadings
sns.scatterplot(
Expand Down
287 changes: 107 additions & 180 deletions ddmc/figures/figureM5.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,119 @@
"""
This creates Figure 5: Tumor vs NAT analysis
"""

import numpy as np
import pandas as pd
import seaborn as sns
import textwrap
import mygene
from scipy.stats import mannwhitneyu
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.stats.multitest import multipletests
from .common import getSetup, getDDMC_CPTAC
from ..logistic_regression import plotClusterCoefficients, plotROC
from .common import plot_distance_to_upstream_kinase, TumorType
from ..pca import plotPCA
from ..pre_processing import filter_NaNpeptides

from ddmc.clustering import DDMC
from ddmc.datasets import CPTAC, select_peptide_subset
from ddmc.figures.common import getSetup
from ddmc.figures.common import plot_cluster_kinase_distances
from ddmc.logistic_regression import plotClusterCoefficients, plotROC


def makeFigure():
"""Get a list of the axis objects and create a figure"""
# Get list of axis objects
ax, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1})
axes, f = getSetup((11, 10), (3, 3), multz={0: 1, 4: 1})
cptac = CPTAC()
p_signal = select_peptide_subset(cptac.get_p_signal(), keep_ratio=0.1)
model = DDMC(n_components=30, seq_weight=100, max_iter=5).fit(p_signal)

# Fit DDMC
model, X = getDDMC_CPTAC(n_components=30, SeqWeight=100.0)

# Normalize
centers = pd.DataFrame(model.transform()).T
centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(
centers.iloc[:, :]
)
centers = centers.T
centers.columns = np.arange(model.n_components) + 1
centers["Patient_ID"] = X.columns[4:]
centers = TumorType(centers).set_index("Patient_ID")
centers = centers.drop([14, 24], axis=1) # Drop cluster 19, contains only 1 peptide
centers = model.transform(as_df=True)
centers.loc[:, :] = StandardScaler(with_std=False).fit_transform(centers.values)
is_tumor = cptac.get_tumor_or_nat(centers.index)

# first plot heatmap of clusters
# lim = 1.5
# sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim, figsize=(15, 9)) Run in notebook and save as svg
ax[0].axis("off")

# PCA and Hypothesis Testing
pvals = calculate_mannW_pvals(centers, "Type", "NAT", "Tumor")
pvals = build_pval_matrix(model.n_components, pvals)
plotPCA(
ax[1:3],
centers.reset_index(),
2,
["Patient_ID", "Type"],
"Cluster",
hue_scores="Type",
style_scores="Type",
pvals=pvals.iloc[:, -1].values,
axes[0].axis("off")

# get p value of tumor vs NAT for each cluster
pvals = []
centers_tumor = centers[is_tumor]
centers_nat = centers[~is_tumor]
for col in centers.columns:
pvals.append(mannwhitneyu(centers_tumor[col], centers_nat[col])[1])
pvals = multipletests(pvals)[1]

# run PCA on cluster centers
pca = PCA(n_components=2)
scores = pca.fit_transform(centers) # sample by PCA component
loadings = pca.components_ # PCA component by cluster
variance_explained = np.round(pca.explained_variance_ratio_, 2)

# plot scores
sns.scatterplot(
x=scores[:, 0],
y=scores[:, 1],
hue=is_tumor,
ax=axes[1],
**{"linewidth": 0.5, "edgecolor": "k"},
)
axes[1].legend(loc="lower left", prop={"size": 9}, title="Tumor", fontsize=9)
axes[1].set_title("PCA Scores")
axes[1].set_xlabel(
"PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
)
axes[1].set_ylabel(
"PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
)
plot_clusters_binaryfeatures(centers, "Type", ax[3], pvals=pvals, loc="lower left")

# Transform to Binary
c = centers.select_dtypes(include=["float64"])
tt = centers.iloc[:, -1]
tt = tt.replace("NAT", 0)
tt = tt.replace("Tumor", 1)
# plot loadings
sns.scatterplot(
x=loadings[0],
y=loadings[1],
ax=axes[2],
hue=pvals < 0.01,
**{"linewidth": 0.5, "edgecolor": "k"},
)
axes[2].set_title("PCA Loadings")
axes[2].set_xlabel(
"PC1 (" + str(int(variance_explained[0] * 100)) + "%)", fontsize=10
)
axes[2].set_ylabel(
"PC2 (" + str(int(variance_explained[1] * 100)) + "%)", fontsize=10
)
axes[2].legend(title="p < 0.01", prop={"size": 8})
for j, txt in enumerate(centers.columns):
axes[2].annotate(
txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
)

# plot tumor vs nat by cluster
df_violin = (
centers.assign(is_tumor=is_tumor)
.reset_index()
.melt(
id_vars="is_tumor",
value_vars=centers.columns,
value_name="p-signal",
var_name="Cluster",
)
)
sns.violinplot(
data=df_violin,
x="Cluster",
y="p-signal",
hue="is_tumor",
dodge=True,
ax=axes[3],
linewidth=0.25,
)
axes[3].legend(prop={"size": 8})

annotation_height = df_violin["p-signal"].max() + 0.02
for i, pval in enumerate(pvals):
if pval < 0.05:
annotation = "*"
elif pval < 0.01:
annotation = "**"
else:
continue
axes[3].text(
i, annotation_height, annotation, ha="center", va="bottom", fontsize=10
)

# Logistic Regression
lr = LogisticRegressionCV(
Expand All @@ -74,140 +125,16 @@ def makeFigure():
l1_ratios=[0.85],
class_weight="balanced",
)
plotROC(ax[4], lr, c.values, tt, cv_folds=4, return_mAUC=False)
plotClusterCoefficients(ax[5], lr)
ax[5].set_xticklabels(centers.columns[:-1])
plotROC(lr, centers.values, is_tumor, cv_folds=4, return_mAUC=False, ax=axes[4])

# Upstream Kinases
plot_distance_to_upstream_kinase(model, [6, 15, 20], ax[6], num_hits=2)
plotClusterCoefficients(axes[5], lr)

return f
top_clusters = np.argsort(np.abs(lr.coef_.squeeze()))[-3:]

# plot predicted kinases for most weighted clusters
distances = model.predict_upstream_kinases()[top_clusters]

def make_BPtoGenes_table(X, cluster):
d = X[["Clusters", "Description", "geneID"]]
d = d[d["Clusters"] == cluster]
gAr = d[["geneID"]].values
bpAr = d[["Description"]].values
mg = mygene.MyGeneInfo()
BPtoGenesDict = {}
for ii, arr in enumerate(gAr):
gg = mg.querymany(
list(arr[0].split("/")),
scopes="entrezgene",
fields="symbol",
species="human",
returnall=False,
as_dataframe=True,
)
BPtoGenesDict[bpAr[ii][0]] = list(gg["symbol"])
return pd.DataFrame(dict([(k, pd.Series(v)) for k, v in BPtoGenesDict.items()]))


def plot_enriched_processes(ax, X, y, f, cluster, gene_set="WP"):
""""Plot BPs enriched per cluster""" ""
if gene_set == "WP":
gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[
:, 1:
]
elif gene_set == "onco":
gsea = pd.read_csv(
"ddmc/data/cluster_analysis/CPTAC_GSEA_ONCO_results.csv"
).iloc[:, 1:]
elif gene_set == "Immuno":
gsea = pd.read_csv("ddmc/data/cluster_analysis/CPTAC_GSEA_WP_results.csv").iloc[
:, 1:
]
cc = make_BPtoGenes_table(gsea, cluster)
cl = X[X["Cluster"] == cluster].set_index("Gene")
dfs = []
for ii in range(cc.shape[1]):
ss = cl.loc[cc.iloc[:, ii].dropna()].reset_index()
ss["Process"] = cc.columns[ii]
dfs.append(ss)

out = pd.concat(dfs).set_index("Process").select_dtypes(include=[float]).T
out[f[0]] = y
out[f[0]] = out[f[0]].replace(0, f[1])
out[f[0]] = out[f[0]].replace(1, f[2])
dm = pd.melt(
out,
id_vars=f[0],
value_vars=out.columns,
var_name="Process",
value_name="mean log(p-signal)",
)
dm.iloc[:, -1] = dm.iloc[:, -1].astype(float)
sns.boxplot(
data=dm,
x="Process",
y="mean log(p-signal)",
hue=f[0],
showfliers=False,
linewidth=0.5,
ax=ax,
)
ax.set_xticklabels([textwrap.fill(t, 10) for t in list(cc.columns)], rotation=0)
ax.set_title("Processes Cluster " + str(cluster))


def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"):
"""Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster"""
data = pd.melt(
id_vars=id_var,
value_vars=centers.columns[:-1],
value_name="p-signal",
var_name="Cluster",
frame=centers,
plot_cluster_kinase_distances(
distances, model.get_pssms(clusters=top_clusters), axes[6], num_hits=2
)
sns.violinplot(
x="Cluster",
y="p-signal",
hue=id_var,
data=data,
dodge=True,
ax=ax,
linewidth=0.25,
)
ax.legend(prop={"size": 8}, loc=loc)

if not isinstance(pvals, bool):
for ii, s in enumerate(pvals["Significant"]):
y, h, col = data["p-signal"].max(), 0.05, "k"
if s == "NS":
continue
elif s == "<0.05":
mark = "*"
else:
mark = "**"
ax.text(ii, y + h, mark, ha="center", va="bottom", color=col, fontsize=20)


def calculate_mannW_pvals(centers, col, feature1, feature2):
"""Compute Mann Whitney rank test p-values corrected for multiple tests."""
pvals, clus = [], []
for ii in centers.columns[:-1]:
x = centers.loc[:, [ii, col]]
x1 = x[x[col] == feature1].iloc[:, 0]
x2 = x[x[col] == feature2].iloc[:, 0]
pval = mannwhitneyu(x1, x2)[1]
pvals.append(pval)
clus.append(ii)
return dict(zip(clus, multipletests(pvals)[1]))


def build_pval_matrix(ncl, pvals):
"""Build data frame with pvalues per cluster"""
data = pd.DataFrame()
data["Clusters"] = pvals.keys()
data["p-value"] = pvals.values()
signif = []
for val in pvals.values():
if 0.01 < val < 0.05:
signif.append("<0.05")
elif val < 0.01:
signif.append("<0.01")
else:
signif.append("NS")
data["Significant"] = signif
return data
return f
2 changes: 1 addition & 1 deletion ddmc/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def plotClusterCoefficients(ax: Axes, lr, hue=None, xlabels=False, title=False):
coefs_["Sample"] = [l.split("_")[1] for l in hue]
hue = "Sample"
else:
coefs_["Cluster"] = np.arange(coefs_.shape[0]) + 1
coefs_["Cluster"] = np.arange(coefs_.shape[0])
if xlabels:
coefs_["Cluster"] = xlabels
p = sns.barplot(
Expand Down

0 comments on commit b190377

Please sign in to comment.