Skip to content

Commit

Permalink
Lots more cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
aarmey committed Jan 21, 2024
1 parent 6bd6040 commit fb310e2
Show file tree
Hide file tree
Showing 15 changed files with 113 additions and 286 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Install dependencies
run: poetry install
- name: Build figures
run: make -i all
run: make -j 2 all
- name: Upload files
uses: actions/upload-artifact@v4
with:
Expand Down
125 changes: 75 additions & 50 deletions ddmc/figures/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,29 @@
import sys
import time
from string import ascii_uppercase
from matplotlib import gridspec, pyplot as plt, axes
from matplotlib import gridspec, pyplot as plt, axes, rcParams
import seaborn as sns
import numpy as np
import pandas as pd
import svgutils.transform as st
import logomaker as lm
from sklearn.preprocessing import StandardScaler
from ..pre_processing import filter_NaNpeptides
from ..clustering import DDMC
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
from ..pre_processing import MeanCenter
from ..motifs import KinToPhosphotypeDict


rcParams["font.sans-serif"] = "Arial"


def getSetup(
figsize: tuple[int, int], gridd: tuple[int, int], multz: None | dict = None
figsize: tuple[int, int],
gridd: tuple[int, int],
multz: None | dict = None,
labels=True,
) -> tuple:
"""Establish figure set-up with subplots."""
sns.set(
Expand Down Expand Up @@ -47,6 +55,9 @@ def getSetup(
x += multz[x]
x += 1

if labels:
subplotLabel(ax)

return (ax, f)


Expand All @@ -64,7 +75,9 @@ def subplotLabel(axs: list[axes.Axes]):
)


def overlayCartoon(figFile, cartoonFile, x, y, scalee: float = 1.0):
def overlayCartoon(
figFile: str, cartoonFile: str, x: float, y: float, scalee: float = 1.0
):
"""Add cartoon to a figure file."""

# Overlay Figure cartoons
Expand Down Expand Up @@ -113,6 +126,26 @@ def genFigure():
print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n")


def getDDMC_CPTAC(n_components: int, SeqWeight: float) -> DDMC:
    """Fit a DDMC model on the preprocessed CPTAC signaling data set.

    The CPTAC motif CSV is read from its fixed location in the repository,
    its first column is dropped, and the result is passed through
    ``filter_NaNpeptides`` with ``tmt=2``. The float-typed columns
    (transposed) are used as the data to fit, and the "Sequence" column is
    handed to the DDMC constructor.

    Args:
        n_components: forwarded to ``DDMC`` as ``n_components``.
        SeqWeight: forwarded to ``DDMC`` as ``SeqWeight``.

    Returns:
        Whatever ``DDMC(...).fit(...)`` returns (annotated as a ``DDMC``).
    """
    # Load the raw table, discarding the leading column before filtering.
    cptac_csv = "ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv"
    raw_table = pd.read_csv(cptac_csv)
    X = filter_NaNpeptides(raw_table.iloc[:, 1:], tmt=2)

    # Split into the peptide sequences and the numeric signal matrix.
    sequences = X["Sequence"]
    signal = X.select_dtypes(include=[float]).T

    # Fixed distance method and seed, so repeated calls are configured alike.
    ddmc = DDMC(
        sequences,
        n_components=n_components,
        SeqWeight=SeqWeight,
        distance_method="Binomial",
        random_state=5,
    )
    return ddmc.fit(signal)


def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
"""Generate logo plots of a list of PSSMs"""
pssm = pssm.T
Expand Down Expand Up @@ -140,11 +173,10 @@ def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False):


def plotDistanceToUpstreamKinase(
model,
clusters,
model: DDMC,
clusters: list[int],
ax,
kind="strip",
num_hits=5,
num_hits: int = 5,
additional_pssms=False,
add_labels=False,
title=False,
Expand All @@ -161,52 +193,45 @@ def plotDistanceToUpstreamKinase(
if isinstance(add_labels, list):
clusters += add_labels
data = ukin_mc.sort_values(by="Kinase").set_index("Kinase")[clusters]
if kind == "heatmap":
sns.heatmap(data.T, ax=ax, xticklabels=data.index)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=7)
ax.set_ylabel("Cluster")

elif kind == "strip":
data = pd.melt(
data.reset_index(),
id_vars="Kinase",
value_vars=list(data.columns),
var_name="Cluster",
value_name="Frobenius Distance",
)
if isinstance(add_labels, list):
# Actual ERK predictions
data["Cluster"] = data["Cluster"].astype(str)
d1 = data[~data["Cluster"].str.contains("_S")]
sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0])
print(cOG)
AnnotateUpstreamKinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1)

# Shuffled
d2 = data[data["Kinase"] == "ERK2"]
d2["Shuffled"] = ["_S" in s for s in d2["Cluster"]]
d2["Cluster"] = [s.split("_S")[0] for s in d2["Cluster"]]
sns.stripplot(
data=d2,
x="Cluster",
y="Frobenius Distance",
hue="Shuffled",
ax=ax[1],
size=8,
)
ax[1].set_title("ERK2 Shuffled Positions")
ax[1].legend(prop={"size": 10}, loc="lower left")
DrawArrows(ax[1], d2)

else:
sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax)
AnnotateUpstreamKinases(model, clusters, ax, data, num_hits)
if title:
ax.set_title(title)
data = pd.melt(
data.reset_index(),
id_vars="Kinase",
value_vars=list(data.columns),
var_name="Cluster",
value_name="Frobenius Distance",
)
if isinstance(add_labels, list):
# Actual ERK predictions
data["Cluster"] = data["Cluster"].astype(str)
d1 = data[~data["Cluster"].str.contains("_S")]
sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0])
print(cOG)
AnnotateUpstreamKinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1)

# Shuffled
d2 = data[data["Kinase"] == "ERK2"]
d2["Shuffled"] = ["_S" in s for s in d2["Cluster"]]
d2["Cluster"] = [s.split("_S")[0] for s in d2["Cluster"]]
sns.stripplot(
data=d2,
x="Cluster",
y="Frobenius Distance",
hue="Shuffled",
ax=ax[1],
size=8,
)
ax[1].set_title("ERK2 Shuffled Positions")
ax[1].legend(prop={"size": 10}, loc="lower left")
DrawArrows(ax[1], d2)
else:
sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax)
AnnotateUpstreamKinases(model, clusters, ax, data, num_hits)
if title:
ax.set_title(title)


def AnnotateUpstreamKinases(model, clusters, ax, data, num_hits=1):
def AnnotateUpstreamKinases(model: DDMC, clusters, ax, data, num_hits: int = 1):
"""Annotate upstream kinase predictions"""
data.iloc[:, 1] = data.iloc[:, 1].astype(str)
pssms, _ = model.pssms()
Expand Down
16 changes: 1 addition & 15 deletions ddmc/figures/figureM2.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""
This creates Figure 2: Evaluation of Imputing Missingness
"""
import matplotlib
import numpy as np
from scipy.stats import gmean
import pandas as pd
import seaborn as sns
from .common import subplotLabel, getSetup
from .common import getSetup
from ..clustering import DDMC
from ..pre_processing import filter_NaNpeptides
from fancyimpute import IterativeSVD
Expand All @@ -17,19 +16,6 @@ def makeFigure():
# Get list of axis objects
ax, f = getSetup((10, 10), (3, 3), multz={0: 2})

# Set plotting format
matplotlib.rcParams["font.sans-serif"] = "Arial"
sns.set(
style="whitegrid",
font_scale=1,
color_codes=True,
palette="colorblind",
rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6},
)

# Add subplot labels
subplotLabel(ax)

# diagram explaining reconstruction process
ax[0].axis("off")

Expand Down
18 changes: 2 additions & 16 deletions ddmc/figures/figureM3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
from ..clustering import DDMC
from ..binomial import AAlist
from .common import subplotLabel, getSetup
from .common import getSetup
from ..pca import plotPCA
from .common import plotDistanceToUpstreamKinase, plotMotifs
from ..clustering import compute_control_pssm
Expand All @@ -23,19 +22,6 @@ def makeFigure():
# Get list of axis objects
ax, f = getSetup((12, 12), (3, 3), multz={3: 1})

# Add subplot labels
subplotLabel(ax)

# Set plotting format
matplotlib.rcParams["font.sans-serif"] = "Arial"
sns.set(
style="whitegrid",
font_scale=1,
color_codes=True,
palette="colorblind",
rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6},
)

# Import signaling data
x = preprocess_ebdt_mcf7()
d = x.select_dtypes(include=[float]).T
Expand Down Expand Up @@ -90,7 +76,7 @@ def makeFigure():
erk2 = compute_control_pssm([s.upper() for s in erk2["Peptide"]])
erk2 = pd.DataFrame(np.clip(erk2, a_min=0, a_max=3))
erk2.index = AAlist
plotMotifs(erk2, axes=ax[5], titles="ERK2")
plotMotifs(erk2, ax=ax[5], titles="ERK2")

# ERK2 prediction
# Import signaling data
Expand Down
37 changes: 8 additions & 29 deletions ddmc/figures/figureM4.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from ..clustering import DDMC
from .common import subplotLabel, getSetup, HotColdBehavior
from .common import getSetup, HotColdBehavior, getDDMC_CPTAC
from ..logistic_regression import plotROC
from ..pre_processing import filter_NaNpeptides

Expand All @@ -20,25 +19,11 @@ def makeFigure():
# Get list of axis objects
ax, f = getSetup((8, 4), (1, 3))

# Add subplot labels
subplotLabel(ax)

# Set plotting format
matplotlib.rcParams["font.sans-serif"] = "Arial"
sns.set(
style="whitegrid",
font_scale=0.5,
color_codes=True,
palette="colorblind",
rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6},
)

X = filter_NaNpeptides(
pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
tmt=2,
)
d = X.select_dtypes(include=[float]).T
i = X["Sequence"]

return f # TODO: This code is broken.

Expand All @@ -61,15 +46,9 @@ def makeFigure():
ax[0].legend(prop={"size": 5}, loc=0)

# Fit Data, Mix, and Seq Models
dataM = DDMC(
i, n_components=30, SeqWeight=0, distance_method="Binomial", random_state=5
).fit(d)
mixM = DDMC(
i, n_components=30, SeqWeight=250, distance_method="Binomial", random_state=5
).fit(d)
seqM = DDMC(
i, n_components=30, SeqWeight=1e6, distance_method="Binomial", random_state=5
).fit(d)
dataM = getDDMC_CPTAC(n_components=30, SeqWeight=0.0)
mixM = getDDMC_CPTAC(n_components=30, SeqWeight=250.0)
seqM = getDDMC_CPTAC(n_components=30, SeqWeight=1.0e6)
models = [dataM, mixM, seqM]

# Center to peptide distance
Expand All @@ -81,7 +60,7 @@ def makeFigure():
return f


def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35):
def calculate_AUCs_phenotypes(ax, X: pd.DataFrame, nRuns=3, n_components=35):
"""Plot mean AUCs per phenotype across weights."""
# Signaling
d = X.select_dtypes(include=[float]).T
Expand Down Expand Up @@ -124,7 +103,7 @@ def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35):
ax,
lr,
centers_gen.values,
y["STK11.mutation.status"],
y["STK11.mutation.status"].values,
cv_folds=3,
return_mAUC=True,
kfold="Repeated",
Expand Down Expand Up @@ -238,7 +217,7 @@ def merge_binary_vectors(y, mutant1, mutant2):
return pd.Series(y_)


def find_patients_with_NATandTumor(X, label, conc=False):
def find_patients_with_NATandTumor(X: pd.DataFrame, label, conc=False) -> pd.DataFrame:
"""Reshape data to display patients as rows and samples (Tumor and NAT per cluster) as columns.
Note that to do so, samples that don't have their tumor/NAT counterpart are dropped.
"""
Expand All @@ -259,7 +238,7 @@ def find_patients_with_NATandTumor(X, label, conc=False):
return X


def TransformCenters(model, X):
def TransformCenters(model: DDMC, X: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""For a given model, find centers and transform for regression."""
centers = pd.DataFrame(model.transform()).T
centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(
Expand Down
Loading

0 comments on commit fb310e2

Please sign in to comment.