Skip to content

Commit

Permalink
Some fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
aarmey committed Jan 21, 2024
1 parent 7fb1c7a commit 6bd6040
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 178 deletions.
164 changes: 33 additions & 131 deletions ddmc/figures/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,22 @@
import sys
import time
from string import ascii_uppercase
from matplotlib import gridspec, pyplot as plt
from matplotlib import gridspec, pyplot as plt, axes
import seaborn as sns
import numpy as np
import pandas as pd
import svgutils.transform as st
import logomaker as lm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from ..clustering import DDMC
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
from ..pre_processing import MeanCenter
from ..motifs import KinToPhosphotypeDict


def getSetup(figsize, gridd, multz=None):
def getSetup(
figsize: tuple[int, int], gridd: tuple[int, int], multz: None | dict = None
) -> tuple:
"""Establish figure set-up with subplots."""
sns.set(
style="whitegrid",
Expand All @@ -34,7 +34,7 @@ def getSetup(figsize, gridd, multz=None):

# Setup plotting space and grid
f = plt.figure(figsize=figsize, constrained_layout=True)
gs1 = gridspec.GridSpec(*gridd, figure=f)
gs1 = gridspec.GridSpec(gridd[0], gridd[1], figure=f)

# Get list of axis objects
x = 0
Expand All @@ -50,7 +50,7 @@ def getSetup(figsize, gridd, multz=None):
return (ax, f)


def subplotLabel(axs):
def subplotLabel(axs: list[axes.Axes]):
"""Place subplot labels on the list of axes."""
for ii, ax in enumerate(axs):
ax.text(
Expand All @@ -64,18 +64,14 @@ def subplotLabel(axs):
)


def overlayCartoon(
figFile, cartoonFile, x, y, scalee=1, scale_x=1, scale_y=1, rotate=None
):
def overlayCartoon(figFile, cartoonFile, x, y, scalee: float = 1.0):
"""Add cartoon to a figure file."""

# Overlay Figure cartoons
template = st.fromfile(figFile)
cartoon = st.fromfile(cartoonFile).getroot()

cartoon.moveto(x, y, scale_x=scalee * scale_x, scale_y=scalee * scale_y)
if rotate:
cartoon.rotate(rotate, x, y)
cartoon.moveto(x, y, scale_x=scalee, scale_y=scalee) # type: ignore

template.append(cartoon)
template.save(figFile)
Expand Down Expand Up @@ -117,124 +113,30 @@ def genFigure():
print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n")


def ComputeCenters(X, d, i, ddmc, ncl):
    """Compute cluster centers with k-means and three DDMC configurations.

    k-means is fitted on ``d.T``; its labels are attached to a copy of ``X``
    as a "Cluster" column and the per-cluster means (transposed) form the
    k-means centers. Two new DDMC models are then fitted on ``d`` using the
    ``distance_method`` and ``random_state`` of the supplied ``ddmc`` model:
    one with ``SeqWeight=0`` (labelled "GMM") and one with the supplied
    model's ``SeqWeight`` raised by 20 (labelled "DDMC seq"). The supplied
    model's own ``transform()`` output is labelled "DDMC mix".

    Returns a pair ``(centers, labels)`` where both items are lists.

    NOTE(review): ``labels`` has five entries ("Unclustered" first) while
    ``centers`` has four; presumably callers pair "Unclustered" with the raw
    data themselves -- confirm against the call sites.
    """

    def _refit_centers(seq_weight):
        # Fit a fresh DDMC model that differs from `ddmc` only in SeqWeight.
        refit = DDMC(
            i,
            ncl=ncl,
            SeqWeight=seq_weight,
            distance_method=ddmc.distance_method,
            random_state=ddmc.random_state,
        )
        return refit.fit(d).transform()

    # k-means
    kmeans_labels = KMeans(n_clusters=ncl).fit(d.T).labels_
    labelled = X.copy()
    labelled["Cluster"] = kmeans_labels
    c_kmeans = labelled.groupby("Cluster").mean().T

    # GMM
    c_gmm = _refit_centers(0)

    # DDMC seq
    ddmc_seq_c = _refit_centers(ddmc.SeqWeight + 20)

    # DDMC mix
    ddmc_c = ddmc.transform()

    centers = [c_kmeans, c_gmm, ddmc_seq_c, ddmc_c]
    names = ["Unclustered", "k-means", "GMM", "DDMC seq", "DDMC mix"]
    return centers, names


def plotCenters(ax, model, xlabels, yaxis=False, drop=False):
    """Draw every cluster center as a line plot on its own axis.

    Args:
        ax: indexable collection of axes; ``ax[i]`` receives the i-th center
            that remains after ``drop`` is applied.
        model: fitted object exposing ``transform()``, ``labels()`` and
            ``n_components``.
        xlabels: names assigned to the columns of the transposed centers and
            used as x tick labels.
        yaxis: optional ``(low, high)`` pair applied as y-limits.
        drop: optional index label(s) of centers to leave out; passed to
            ``DataFrame.drop``.
    """
    centers = pd.DataFrame(model.transform()).T
    centers.columns = xlabels
    if drop:
        centers = centers.drop(drop)

    # labels() is compared against 1..n_components, so entry k of this list
    # is the number of peptides assigned to cluster k + 1.
    num_peptides = [
        np.count_nonzero(model.labels() == jj)
        for jj in range(1, model.n_components + 1)
    ]
    tick_positions = np.arange(len(xlabels))

    for i, cluster_idx in enumerate(centers.index):
        cl = pd.DataFrame(centers.iloc[i, :]).T
        m = pd.melt(
            cl, value_vars=list(cl.columns), value_name="p-signal", var_name="Lines"
        )
        m["p-signal"] = m["p-signal"].astype("float64")
        sns.lineplot(
            x="Lines", y="p-signal", data=m, color="#658cbb", ax=ax[i], linewidth=2
        )
        # Fix the tick positions first; set_xticklabels is only reliable once
        # the positions are pinned.
        ax[i].set_xticks(tick_positions)
        ax[i].set_xticklabels(xlabels, rotation=45)
        ax[i].set_ylabel("$log_{10}$ p-signal")
        ax[i].xaxis.set_tick_params(bottom=True)
        ax[i].set_xlabel("")
        # Look the count up by the center's index label rather than by its
        # row position: after `drop` the two differ, and positional lookup
        # would report the size of a different cluster.
        ax[i].set_title(
            f"Cluster {cluster_idx + 1} Center (n={num_peptides[cluster_idx]})"
        )
        if yaxis:
            ax[i].set_ylim([yaxis[0], yaxis[1]])


def plotMotifs(pssms, axes, titles=False, yaxis=False):
def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
"""Generate logo plots of a list of PSSMs"""
for i, ax in enumerate(axes):
pssm = pssms[i].T
if pssm.shape[0] == 11:
pssm.index = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
elif pssm.shape[0] == 9:
pssm.index = [-5, -4, -3, -2, -1, 1, 2, 3, 4]
logo = lm.Logo(
pssm,
font_name="Arial",
vpad=0.1,
width=0.8,
flip_below=False,
center_values=False,
ax=ax,
)
logo.ax.set_ylabel("log_{2} (Enrichment Score)")
logo.style_xticks(anchor=1, spacing=1)
if titles:
logo.ax.set_title(titles[i] + " Motif")
else:
logo.ax.set_title("Motif Cluster " + str(i + 1))
if yaxis:
logo.ax.set_ylim([yaxis[0], yaxis[1]])


def plot_LassoCoef(ax, model, title=False):
"""Plot Lasso Coefficients"""
coefs = pd.DataFrame(model.coef_).T
coefs.index += 1
coefs = coefs.reset_index()
coefs.columns = ["Cluster", "Viability", "Apoptosis", "Migration", "Island"]
m = pd.melt(
coefs,
id_vars="Cluster",
value_vars=list(coefs.columns)[1:],
var_name="Phenotype",
value_name="Coefficient",
pssm = pssm.T
if pssm.shape[0] == 11:
pssm.index = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
elif pssm.shape[0] == 9:
pssm.index = [-5, -4, -3, -2, -1, 1, 2, 3, 4]
logo = lm.Logo(
pssm,
font_name="Arial",
vpad=0.1,
width=0.8,
flip_below=False,
center_values=False,
ax=ax,
)
sns.barplot(x="Cluster", y="Coefficient", hue="Phenotype", data=m, ax=ax)
if title:
ax.set_title(title)
logo.ax.set_ylabel("log_{2} (Enrichment Score)")
logo.style_xticks(anchor=1, spacing=1)
if titles:
logo.ax.set_title(titles + " Motif")
else:
logo.ax.set_title("Motif Cluster 1")
if yaxis:
logo.ax.set_ylim([yaxis[0], yaxis[1]])


def plotDistanceToUpstreamKinase(
Expand Down Expand Up @@ -400,7 +302,7 @@ def calculate_mannW_pvals(centers, col, feature1, feature2):
return dict(zip(clus, multipletests(pvals)[1]))


def build_pval_matrix(ncl, pvals):
def build_pval_matrix(ncl, pvals) -> pd.DataFrame:
"""Build data frame with pvalues per cluster"""
data = pd.DataFrame()
data["Clusters"] = pvals.keys()
Expand All @@ -417,7 +319,7 @@ def build_pval_matrix(ncl, pvals):
return data


def TumorType(X):
def TumorType(X: pd.DataFrame) -> pd.DataFrame:
"""Add NAT vs Tumor column."""
tumortype = []
for i in range(X.shape[0]):
Expand Down Expand Up @@ -523,7 +425,7 @@ def TransformCenters(model, X):
def HotColdBehavior(centers):
# Import Cold-Hot Tumor data
y = (
pd.read_csv("ddmc/data/CPTAC_LUAD/Hot_Cold.csv")
pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv")
.dropna(axis=1)
.sort_values(by="Sample ID")
)
Expand Down
2 changes: 1 addition & 1 deletion ddmc/figures/figureM3.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def makeFigure():
erk2 = compute_control_pssm([s.upper() for s in erk2["Peptide"]])
erk2 = pd.DataFrame(np.clip(erk2, a_min=0, a_max=3))
erk2.index = AAlist
plotMotifs([erk2], axes=[ax[5]], titles=["ERK2"])
plotMotifs(erk2, axes=ax[5], titles="ERK2")

# ERK2 prediction
# Import signaling data
Expand Down
2 changes: 1 addition & 1 deletion ddmc/figures/figureM4.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def makeFigure():
d = X.select_dtypes(include=[float]).T
i = X["Sequence"]

return f # TODO: This code is broken.
return f # TODO: This code is broken.

# Plot mean AUCs per model
p = pd.read_csv("ddmc/data/Performance/preds_phenotypes_rs_15cl.csv").iloc[:, 1:]
Expand Down
14 changes: 1 addition & 13 deletions ddmc/figures/figureM5.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ..clustering import DDMC
from .common import subplotLabel, getSetup
from ..logistic_regression import plotClusterCoefficients, plotROC
from .common import plotDistanceToUpstreamKinase
from .common import plotDistanceToUpstreamKinase, TumorType
from ..pca import plotPCA
from ..pre_processing import filter_NaNpeptides

Expand Down Expand Up @@ -176,18 +176,6 @@ def plot_enriched_processes(ax, X, y, f, cluster, gene_set="WP"):
ax.set_title("Processes Cluster " + str(cluster))


def TumorType(X):
    """Add a "Type" column marking each row as NAT or Tumor.

    A row is labelled "NAT" when its "Patient_ID" ends with ".N" and "Tumor"
    otherwise. The column is written into ``X`` itself and the same frame is
    returned.
    """
    # Walk the column values in row order. Indexing X["Patient_ID"][i] is a
    # label lookup, which raises KeyError (or picks the wrong row) whenever
    # the frame's index is not the default 0..n-1 range.
    X["Type"] = [
        "NAT" if patient_id.endswith(".N") else "Tumor"
        for patient_id in X["Patient_ID"]
    ]
    return X


def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"):
"""Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster"""
data = pd.melt(
Expand Down
4 changes: 3 additions & 1 deletion ddmc/figures/figureM7.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def makeFigure():
ax[0].legend(loc="lower left", prop={"size": 10})

# Logistic Regression
lr = LogisticRegressionCV(cv=15, solver="saga", n_jobs=-1, penalty="l1")
lr = LogisticRegressionCV(
cv=15, solver="saga", n_jobs=-1, penalty="l1", max_iter=10000
)
plotROC(ax[1], lr, cent1.iloc[:, :-1].values, y, cv_folds=4, title="ROC TI")
plotClusterCoefficients(
ax[2], lr.fit(cent1.iloc[:, :-1], y.values), title="TI weights"
Expand Down
2 changes: 1 addition & 1 deletion ddmc/figures/figureMS2.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def makeFigure():
xlabels = [20, 21, 22, 23, 24, 25]
for ii, cc in enumerate(cl_num):
cluster = "Cluster " + str(cc)
plotMotifs([pssms[ii]], axes=[ax[ii]], titles=[cluster], yaxis=[0, 10])
plotMotifs(pssms[ii], axes=ax[ii], titles=cluster, yaxis=[0, 10])
if ii not in ylabels:
ax[ii].set_ylabel("")
ax[ii].get_yaxis().set_visible(False)
Expand Down
40 changes: 12 additions & 28 deletions ddmc/figures/figureMS4.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import seaborn as sns
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cluster import KMeans
from sklearn.preprocessing import label_binarize
from ..clustering import DDMC
from .common import subplotLabel, getSetup
from ..pre_processing import filter_NaNpeptides
Expand Down Expand Up @@ -42,9 +43,8 @@ def makeFigure():
z = TumorType(d)
z.iloc[:, -1] = z.iloc[:, -1].replace("Normal", "NAT")
d = z.iloc[:, 1:-1]
y = z.iloc[:, -1]
y = y.replace("NAT", 0)
y = y.replace("Tumor", 1)

y = label_binarize(z["Type"], classes=["NAT", "Tumor"])

# DDMC ROC
ncl = 30
Expand All @@ -70,48 +70,32 @@ def makeFigure():
# Tumor vs NAT unclustered
plotROC(ax[0], lr, d.values, y, cv_folds=4, title="ROC unclustered")
ax[0].set_title("Unclustered ROC")
plot_unclustered_LRcoef(ax[1], lr, d, y, z)
plot_unclustered_LRcoef(ax[1], lr, d, y)

# k-means
labels = KMeans(n_clusters=ncl).fit(d.T).labels_
x_ = X.copy()
x_["Cluster"] = labels
c_kmeans = x_.groupby("Cluster").mean().T
c_kmeans.columns = list(np.arange(ncl) + 1)
km_lr = lr.fit(c_kmeans, y)
plotROC(ax[3], km_lr, c_kmeans.values, y, cv_folds=4, title="ROC k-means")
kmeans = KMeans(n_clusters=ncl).fit(d.T)

plotROC(ax[3], lr, kmeans.cluster_centers_.T, y, cv_folds=4, title="ROC k-means")
ax[3].set_title("k-means ROC")

# GMM
gmm = DDMC(
X["Sequence"], n_components=ncl, SeqWeight=0, distance_method="Binomial"
).fit(d)
x_ = X.copy()
x_["Cluster"] = gmm.labels()
c_gmm = x_.groupby("Cluster").mean().T
gmm_lr = lr.fit(c_gmm, y)
plotROC(ax[4], gmm_lr, c_gmm.values, y, cv_folds=4, title="ROC GMM")

plotROC(ax[4], lr, gmm.transform(), y, cv_folds=4, title="ROC GMM")
ax[4].set_title("GMM ROC")

return f


def plot_unclustered_LRcoef(ax, lr, d, y, z, title=False):
def plot_unclustered_LRcoef(ax, lr, X: pd.DataFrame, y: np.ndarray):
"""Plot logistic regression coefficients of unclustered data"""
weights = []
w = pd.DataFrame()
for _ in range(3):
lr = LogisticRegressionCV(
cv=3,
solver="saga",
max_iter=10000,
n_jobs=-1,
penalty="elasticnet",
l1_ratios=[0.85],
class_weight="balanced",
)
w["Coefficients"] = lr.fit(d, y).coef_[0]
w["p-sites"] = z.columns[2:]
w["Coefficients"] = lr.fit(X, y).coef_[0]
w["p-sites"] = X.columns[2:]
weights.append(w)

coefs = pd.concat(weights)
Expand Down
2 changes: 1 addition & 1 deletion ddmc/figures/figureMS7.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def makeFigure():

# Fit DDMC to complete data
d = np.array(X.select_dtypes(include=["float64"]).T)
i = X.select_dtypes(include=["object"])
i = X["Sequence"]

assert np.all(np.isfinite(d))
model_min = DDMC(i, n_components=30, SeqWeight=100, distance_method="Binomial").fit(
Expand Down
Loading

0 comments on commit 6bd6040

Please sign in to comment.