Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
armaan-abraham committed Feb 2, 2024
1 parent d0fb555 commit e801ce5
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 316 deletions.
239 changes: 35 additions & 204 deletions ddmc/figures/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This file contains functions that are used in multiple figures.
"""

import sys
import time
from pathlib import Path
Expand Down Expand Up @@ -127,26 +128,6 @@ def genFigure():
print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n")


def getDDMC_CPTAC(n_components: int, SeqWeight: float):
# Import signaling data
X = filter_NaNpeptides(
pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
tmt=2,
)
d = X.select_dtypes(include=[float]).T
i = X["Sequence"]

# Fit DDMC
model = DDMC(
i,
n_components=n_components,
SeqWeight=SeqWeight,
distance_method="Binomial",
random_state=5,
).fit(d)
return model, X


def plot_motifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
"""Generate logo plots of a list of PSSMs"""
pssm = pssm.T
Expand All @@ -173,7 +154,9 @@ def plot_motifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
logo.ax.set_ylim([yaxis[0], yaxis[1]])


def plot_cluster_kinase_distances(distances: pd.DataFrame, pssms: np.ndarray, ax, num_hits=1):
def plot_cluster_kinase_distances(
distances: pd.DataFrame, pssms: np.ndarray, ax, num_hits=1
):
pssm_names = distances.columns

# melt distances
Expand Down Expand Up @@ -218,137 +201,50 @@ def plot_cluster_kinase_distances(distances: pd.DataFrame, pssms: np.ndarray, ax
ax.set_title("Kinase vs Cluster Motif")


def plot_distance_to_upstream_kinase(
model: DDMC,
clusters: list[int],
ax,
num_hits: int = 5,
additional_pssms=False,
add_labels=False,
title=False,
PsP_background=True,
):
"""Plot Frobenius norm between kinase PSPL and cluster PSSMs"""
ukin = model.predict_upstream_kinases(
additional_pssms=additional_pssms,
add_labels=add_labels,
PsP_background=PsP_background,
)
ukin_mc = MeanCenter(ukin, mc_col=True, mc_row=True)
cOG = np.array(clusters).copy()
if isinstance(add_labels, list):
clusters += add_labels
data = ukin_mc.sort_values(by="Kinase").set_index("Kinase")[clusters]

data = pd.melt(
data.reset_index(),
id_vars="Kinase",
value_vars=list(data.columns),
var_name="Cluster",
value_name="Frobenius Distance",
)
if isinstance(add_labels, list):
# Actual ERK predictions
data["Cluster"] = data["Cluster"].astype(str)
d1 = data[~data["Cluster"].str.contains("_S")]
sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0])
annotate_upstream_kinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1)

# Shuffled
d2 = data[data["Kinase"] == "ERK2"]
d2["Shuffled"] = ["_S" in s for s in d2["Cluster"]]
d2["Cluster"] = [s.split("_S")[0] for s in d2["Cluster"]]
sns.stripplot(
data=d2,
x="Cluster",
y="Frobenius Distance",
hue="Shuffled",
ax=ax[1],
size=8,
)
ax[1].set_title("ERK2 Shuffled Positions")
ax[1].legend(prop={"size": 10}, loc="lower left")
draw_arrows(ax[1], d2)
else:
sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax)
annotate_upstream_kinases(model, clusters, ax, data, num_hits)
if title:
ax.set_title(title)
def get_pvals_across_clusters(
label: pd.Series[bool] | np.ndarray[bool], centers: pd.DataFrame | np.ndarray
) -> np.ndarray[float]:
pvals = []
centers_pos = centers[label]
centers_neg = centers[~label]
for i in range(centers.shape[1]):
pvals.append(mannwhitneyu(centers_pos[:, i], centers_neg[:, i])[1])
return multipletests(pvals)[1]


def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"):
"""Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster"""
data = pd.melt(
id_vars=id_var,
value_vars=centers.columns[:-1],
def plot_p_signal_across_clusters_and_binary_feature(
label: pd.Series[bool] | np.ndarray[bool],
centers: pd.DataFrame | np.ndarray,
label_name: str,
ax,
) -> None:
centers = centers.copy()
centers[label_name] = label
df_violin = centers.reset_index().melt(
id_vars=label_name,
value_vars=centers.columns,
value_name="p-signal",
var_name="Cluster",
frame=centers,
)
sns.violinplot(
data=df_violin,
x="Cluster",
y="p-signal",
hue=id_var,
data=data,
hue=label_name,
dodge=True,
ax=ax,
linewidth=0.25,
fliersize=2,
)
ax.legend(prop={"size": 8}, loc=loc)

if not isinstance(pvals, bool):
for ii, s in enumerate(pvals["Significant"]):
y, h, col = data["p-signal"].max(), 0.05, "k"
if s == "NS":
continue
elif s == "<0.05":
mark = "*"
else:
mark = "**"
ax.text(ii, y + h, mark, ha="center", va="bottom", color=col, fontsize=20)


def calculate_mannW_pvals(centers, col, feature1, feature2):
"""Compute Mann Whitney rank test p-values corrected for multiple tests."""
pvals, clus = [], []
for ii in centers.columns[:-1]:
x = centers.loc[:, [ii, col]]
x1 = x[x[col] == feature1].iloc[:, 0]
x2 = x[x[col] == feature2].iloc[:, 0]
pval = mannwhitneyu(x1, x2)[1]
pvals.append(pval)
clus.append(ii)
return dict(zip(clus, multipletests(pvals)[1]))


def build_pval_matrix(ncl, pvals) -> pd.DataFrame:
"""Build data frame with pvalues per cluster"""
data = pd.DataFrame()
data["Clusters"] = pvals.keys()
data["p-value"] = pvals.values()
signif = []
for val in pvals.values():
if 0.01 < val < 0.05:
signif.append("<0.05")
elif val < 0.01:
signif.append("<0.01")
ax.legend(prop={"size": 8})
annotation_height = df_violin["p-signal"].max() + 0.02
for i, pval in enumerate(get_pvals_across_clusters(label, centers)):
if pval < 0.05:
annotation = "*"
elif pval < 0.01:
annotation = "**"
else:
signif.append("NS")
data["Significant"] = signif
return data


def TumorType(X: pd.DataFrame) -> pd.DataFrame:
"""Add NAT vs Tumor column."""
tumortype = []
for i in range(X.shape[0]):
if X["Patient_ID"][i].endswith(".N"):
tumortype.append("NAT")
else:
tumortype.append("Tumor")
X["Type"] = tumortype
return X
continue
ax.text(i, annotation_height, annotation, ha="center", va="bottom", fontsize=10)


def ExportClusterFile(cluster, cptac=False, mcf7=False):
Expand Down Expand Up @@ -401,68 +297,3 @@ def ExportClusterFile(cluster, cptac=False, mcf7=False):
for gene in drop_list:
c = c[c["Gene"] != gene]
c.to_csv("Cluster_" + str(cluster) + ".csv")


def find_patients_with_NATandTumor(X, label, conc=False):
"""Reshape data to display patients as rows and samples (Tumor and NAT per cluster) as columns.
Note that to do so, samples that don't have their tumor/NAT counterpart are dropped.
"""
xT = X[~X[label].str.endswith(".N")].sort_values(by=label)
xN = X[X[label].str.endswith(".N")].sort_values(by=label)
l1 = list(xT[label])
l2 = [s.split(".N")[0] for s in xN[label]]
dif = [i for i in l1 + l2 if i not in l1 or i not in l2]
X = xT.set_index(label).drop(dif)
assert all(X.index.values == np.array(l2)), "Samples don't match"

if conc:
xN = xN.set_index(label)
xN.index = l2
xN.columns = [str(i) + "_Normal" for i in xN.columns]
X.columns = [str(i) + "_Tumor" for i in X.columns]
X = pd.concat([X, xN], axis=1)
return X


def TransformCenters(model, X):
"""For a given model, find centers and transform for regression."""
centers = pd.DataFrame(model.transform()).T
centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(
centers.iloc[:, :]
)
centers = centers.T
centers.columns = np.arange(model.n_components) + 1
centers["Patient_ID"] = X.columns[4:]
centers1 = find_patients_with_NATandTumor(centers.copy(), "Patient_ID", conc=True)
centers2 = (
centers.loc[~centers["Patient_ID"].str.endswith(".N"), :]
.sort_values(by="Patient_ID")
.set_index("Patient_ID")
)
return centers1, centers2


def HotColdBehavior(centers):
# Import Cold-Hot Tumor data
y = (
pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv")
.dropna(axis=1)
.sort_values(by="Sample ID")
)
y = y.loc[~y["Sample ID"].str.endswith(".N"), :].set_index("Sample ID")
l1 = list(centers.index)
l2 = list(y.index)
dif = [i for i in l1 + l2 if i not in l1 or i not in l2]
centers = centers.drop(dif)

# Transform to binary
y = y.replace("Cold-tumor enriched", 0)
y = y.replace("Hot-tumor enriched", 1)
y = np.squeeze(y)

# Remove NAT-enriched samples
centers = centers.drop(y[y == "NAT enriched"].index)
y = y.drop(y[y == "NAT enriched"].index).astype(int)
assert all(centers.index.values == y.index.values), "Samples don't match"

return y, centers
51 changes: 8 additions & 43 deletions ddmc/figures/figureM5.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@

from ddmc.clustering import DDMC
from ddmc.datasets import CPTAC, select_peptide_subset
from ddmc.figures.common import getSetup, plot_cluster_kinase_distances
from ddmc.figures.common import (
getSetup,
plot_cluster_kinase_distances,
get_pvals_across_clusters,
plot_p_signal_across_clusters_and_binary_feature,
)
from ddmc.logistic_regression import plotClusterCoefficients, plotROC


Expand All @@ -28,14 +33,6 @@ def makeFigure():
# sns.clustermap(centers.set_index("Type").T, method="complete", cmap="bwr", vmax=lim, vmin=-lim, figsize=(15, 9)) Run in notebook and save as svg
axes[0].axis("off")

# get p value of tumor vs NAT for each cluster
pvals = []
centers_tumor = centers[is_tumor]
centers_nat = centers[~is_tumor]
for col in centers.columns:
pvals.append(mannwhitneyu(centers_tumor[col], centers_nat[col])[1])
pvals = multipletests(pvals)[1]

# run PCA on cluster centers
pca = PCA(n_components=2)
scores = pca.fit_transform(centers) # sample by PCA component
Expand Down Expand Up @@ -64,7 +61,7 @@ def makeFigure():
x=loadings[0],
y=loadings[1],
ax=axes[2],
hue=pvals < 0.01,
hue=get_pvals_across_clusters(is_tumor, centers) < 0.01,
**{"linewidth": 0.5, "edgecolor": "k"},
)
axes[2].set_title("PCA Loadings")
Expand All @@ -80,39 +77,7 @@ def makeFigure():
txt, (loadings[0][j] + 0.001, loadings[1][j] + 0.001), fontsize=10
)

# plot tumor vs nat by cluster
df_violin = (
centers.assign(is_tumor=is_tumor)
.reset_index()
.melt(
id_vars="is_tumor",
value_vars=centers.columns,
value_name="p-signal",
var_name="Cluster",
)
)
sns.violinplot(
data=df_violin,
x="Cluster",
y="p-signal",
hue="is_tumor",
dodge=True,
ax=axes[3],
linewidth=0.25,
)
axes[3].legend(prop={"size": 8})

annotation_height = df_violin["p-signal"].max() + 0.02
for i, pval in enumerate(pvals):
if pval < 0.05:
annotation = "*"
elif pval < 0.01:
annotation = "**"
else:
continue
axes[3].text(
i, annotation_height, annotation, ha="center", va="bottom", fontsize=10
)
plot_p_signal_across_clusters_and_binary_feature(is_tumor, centers, "is_tumor", axes[4])

# Logistic Regression
lr = LogisticRegressionCV(
Expand Down
Loading

0 comments on commit e801ce5

Please sign in to comment.