Some fixes

meyer-lab · Jan 21, 2024 · 6bd6040 · 6bd6040
1 parent 7fb1c7a
commit 6bd6040
Show file tree

Hide file tree

Showing 9 changed files with 54 additions and 178 deletions.
diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py
@@ -4,22 +4,22 @@
 import sys
 import time
 from string import ascii_uppercase
-from matplotlib import gridspec, pyplot as plt
+from matplotlib import gridspec, pyplot as plt, axes
 import seaborn as sns
 import numpy as np
 import pandas as pd
 import svgutils.transform as st
 import logomaker as lm
 from sklearn.preprocessing import StandardScaler
-from sklearn.cluster import KMeans
-from ..clustering import DDMC
 from scipy.stats import mannwhitneyu
 from statsmodels.stats.multitest import multipletests
 from ..pre_processing import MeanCenter
 from ..motifs import KinToPhosphotypeDict
 
 
-def getSetup(figsize, gridd, multz=None):
+def getSetup(
+    figsize: tuple[int, int], gridd: tuple[int, int], multz: None | dict = None
+) -> tuple:
     """Establish figure set-up with subplots."""
     sns.set(
         style="whitegrid",
@@ -34,7 +34,7 @@ def getSetup(figsize, gridd, multz=None):
 
     # Setup plotting space and grid
     f = plt.figure(figsize=figsize, constrained_layout=True)
-    gs1 = gridspec.GridSpec(*gridd, figure=f)
+    gs1 = gridspec.GridSpec(gridd[0], gridd[1], figure=f)
 
     # Get list of axis objects
     x = 0
@@ -50,7 +50,7 @@ def getSetup(figsize, gridd, multz=None):
     return (ax, f)
 
 
-def subplotLabel(axs):
+def subplotLabel(axs: list[axes.Axes]):
     """Place subplot labels on the list of axes."""
     for ii, ax in enumerate(axs):
         ax.text(
@@ -64,18 +64,14 @@ def subplotLabel(axs):
         )
 
 
-def overlayCartoon(
-    figFile, cartoonFile, x, y, scalee=1, scale_x=1, scale_y=1, rotate=None
-):
+def overlayCartoon(figFile, cartoonFile, x, y, scalee: float = 1.0):
     """Add cartoon to a figure file."""
 
     # Overlay Figure cartoons
     template = st.fromfile(figFile)
     cartoon = st.fromfile(cartoonFile).getroot()
 
-    cartoon.moveto(x, y, scale_x=scalee * scale_x, scale_y=scalee * scale_y)
-    if rotate:
-        cartoon.rotate(rotate, x, y)
+    cartoon.moveto(x, y, scale_x=scalee, scale_y=scalee)  # type: ignore
 
     template.append(cartoon)
     template.save(figFile)
@@ -117,124 +113,30 @@ def genFigure():
     print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n")
 
 
-def ComputeCenters(X, d, i, ddmc, ncl):
-    """Calculate cluster centers of  different algorithms."""
-    # k-means
-    labels = KMeans(n_clusters=ncl).fit(d.T).labels_
-    x_ = X.copy()
-    x_["Cluster"] = labels
-    c_kmeans = x_.groupby("Cluster").mean().T
-
-    # GMM
-    ddmc_data = DDMC(
-        i,
-        ncl=ncl,
-        SeqWeight=0,
-        distance_method=ddmc.distance_method,
-        random_state=ddmc.random_state,
-    ).fit(d)
-    c_gmm = ddmc_data.transform()
-
-    # DDMC seq
-    ddmc_seq = DDMC(
-        i,
-        ncl=ncl,
-        SeqWeight=ddmc.SeqWeight + 20,
-        distance_method=ddmc.distance_method,
-        random_state=ddmc.random_state,
-    ).fit(d)
-    ddmc_seq_c = ddmc_seq.transform()
-
-    # DDMC mix
-    ddmc_c = ddmc.transform()
-    return [c_kmeans, c_gmm, ddmc_seq_c, ddmc_c], [
-        "Unclustered",
-        "k-means",
-        "GMM",
-        "DDMC seq",
-        "DDMC mix",
-    ]
-
-
-def plotCenters(ax, model, xlabels, yaxis=False, drop=False):
-    centers = pd.DataFrame(model.transform()).T
-    centers.columns = xlabels
-    if drop:
-        centers = centers.drop(drop)
-    num_peptides = [
-        np.count_nonzero(model.labels() == jj)
-        for jj in range(1, model.n_components + 1)
-    ]
-    for i in range(centers.shape[0]):
-        cl = pd.DataFrame(centers.iloc[i, :]).T
-        m = pd.melt(
-            cl, value_vars=list(cl.columns), value_name="p-signal", var_name="Lines"
-        )
-        m["p-signal"] = m["p-signal"].astype("float64")
-        sns.lineplot(
-            x="Lines", y="p-signal", data=m, color="#658cbb", ax=ax[i], linewidth=2
-        )
-        ax[i].set_xticklabels(xlabels, rotation=45)
-        ax[i].set_xticks(np.arange(len(xlabels)))
-        ax[i].set_ylabel("$log_{10}$ p-signal")
-        ax[i].xaxis.set_tick_params(bottom=True)
-        ax[i].set_xlabel("")
-        ax[i].set_title(
-            "Cluster "
-            + str(centers.index[i] + 1)
-            + " Center "
-            + "("
-            + "n="
-            + str(num_peptides[i])
-            + ")"
-        )
-        if yaxis:
-            ax[i].set_ylim([yaxis[0], yaxis[1]])
-
-
-def plotMotifs(pssms, axes, titles=False, yaxis=False):
+def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
     """Generate logo plots of a list of PSSMs"""
-    for i, ax in enumerate(axes):
-        pssm = pssms[i].T
-        if pssm.shape[0] == 11:
-            pssm.index = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
-        elif pssm.shape[0] == 9:
-            pssm.index = [-5, -4, -3, -2, -1, 1, 2, 3, 4]
-        logo = lm.Logo(
-            pssm,
-            font_name="Arial",
-            vpad=0.1,
-            width=0.8,
-            flip_below=False,
-            center_values=False,
-            ax=ax,
-        )
-        logo.ax.set_ylabel("log_{2} (Enrichment Score)")
-        logo.style_xticks(anchor=1, spacing=1)
-        if titles:
-            logo.ax.set_title(titles[i] + " Motif")
-        else:
-            logo.ax.set_title("Motif Cluster " + str(i + 1))
-        if yaxis:
-            logo.ax.set_ylim([yaxis[0], yaxis[1]])
-
-
-def plot_LassoCoef(ax, model, title=False):
-    """Plot Lasso Coefficients"""
-    coefs = pd.DataFrame(model.coef_).T
-    coefs.index += 1
-    coefs = coefs.reset_index()
-    coefs.columns = ["Cluster", "Viability", "Apoptosis", "Migration", "Island"]
-    m = pd.melt(
-        coefs,
-        id_vars="Cluster",
-        value_vars=list(coefs.columns)[1:],
-        var_name="Phenotype",
-        value_name="Coefficient",
+    pssm = pssm.T
+    if pssm.shape[0] == 11:
+        pssm.index = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
+    elif pssm.shape[0] == 9:
+        pssm.index = [-5, -4, -3, -2, -1, 1, 2, 3, 4]
+    logo = lm.Logo(
+        pssm,
+        font_name="Arial",
+        vpad=0.1,
+        width=0.8,
+        flip_below=False,
+        center_values=False,
+        ax=ax,
     )
-    sns.barplot(x="Cluster", y="Coefficient", hue="Phenotype", data=m, ax=ax)
-    if title:
-        ax.set_title(title)
+    logo.ax.set_ylabel("log_{2} (Enrichment Score)")
+    logo.style_xticks(anchor=1, spacing=1)
+    if titles:
+        logo.ax.set_title(titles + " Motif")
+    else:
+        logo.ax.set_title("Motif Cluster 1")
+    if yaxis:
+        logo.ax.set_ylim([yaxis[0], yaxis[1]])
 
 
 def plotDistanceToUpstreamKinase(
@@ -400,7 +302,7 @@ def calculate_mannW_pvals(centers, col, feature1, feature2):
     return dict(zip(clus, multipletests(pvals)[1]))
 
 
-def build_pval_matrix(ncl, pvals):
+def build_pval_matrix(ncl, pvals) -> pd.DataFrame:
     """Build data frame with pvalues per cluster"""
     data = pd.DataFrame()
     data["Clusters"] = pvals.keys()
@@ -417,7 +319,7 @@ def build_pval_matrix(ncl, pvals):
     return data
 
 
-def TumorType(X):
+def TumorType(X: pd.DataFrame) -> pd.DataFrame:
     """Add NAT vs Tumor column."""
     tumortype = []
     for i in range(X.shape[0]):
@@ -523,7 +425,7 @@ def TransformCenters(model, X):
 def HotColdBehavior(centers):
     # Import Cold-Hot Tumor data
     y = (
-        pd.read_csv("ddmc/data/CPTAC_LUAD/Hot_Cold.csv")
+        pd.read_csv("ddmc/data/MS/CPTAC/Hot_Cold.csv")
         .dropna(axis=1)
         .sort_values(by="Sample ID")
     )

diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py
@@ -90,7 +90,7 @@ def makeFigure():
     erk2 = compute_control_pssm([s.upper() for s in erk2["Peptide"]])
     erk2 = pd.DataFrame(np.clip(erk2, a_min=0, a_max=3))
     erk2.index = AAlist
-    plotMotifs([erk2], axes=[ax[5]], titles=["ERK2"])
+    plotMotifs(erk2, ax=ax[5], titles="ERK2")
 
     # ERK2 prediction
     # Import signaling data

diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py
@@ -40,7 +40,7 @@ def makeFigure():
     d = X.select_dtypes(include=[float]).T
     i = X["Sequence"]
 
-    return f # TODO: This code is broken.
+    return f  # TODO: This code is broken.
 
     # Plot mean AUCs per model
     p = pd.read_csv("ddmc/data/Performance/preds_phenotypes_rs_15cl.csv").iloc[:, 1:]

diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py
@@ -16,7 +16,7 @@
 from ..clustering import DDMC
 from .common import subplotLabel, getSetup
 from ..logistic_regression import plotClusterCoefficients, plotROC
-from .common import plotDistanceToUpstreamKinase
+from .common import plotDistanceToUpstreamKinase, TumorType
 from ..pca import plotPCA
 from ..pre_processing import filter_NaNpeptides
 
@@ -176,18 +176,6 @@ def plot_enriched_processes(ax, X, y, f, cluster, gene_set="WP"):
     ax.set_title("Processes Cluster " + str(cluster))
 
 
-def TumorType(X):
-    """Add NAT vs Tumor column."""
-    tumortype = []
-    for i in range(X.shape[0]):
-        if X["Patient_ID"][i].endswith(".N"):
-            tumortype.append("NAT")
-        else:
-            tumortype.append("Tumor")
-    X["Type"] = tumortype
-    return X
-
-
 def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"):
     """Plot p-signal of binary features (tumor vs NAT or mutational status) per cluster"""
     data = pd.melt(

diff --git a/ddmc/figures/figureM7.py b/ddmc/figures/figureM7.py
@@ -83,7 +83,9 @@ def makeFigure():
     ax[0].legend(loc="lower left", prop={"size": 10})
 
     # Logistic Regression
-    lr = LogisticRegressionCV(cv=15, solver="saga", n_jobs=-1, penalty="l1")
+    lr = LogisticRegressionCV(
+        cv=15, solver="saga", n_jobs=-1, penalty="l1", max_iter=10000
+    )
     plotROC(ax[1], lr, cent1.iloc[:, :-1].values, y, cv_folds=4, title="ROC TI")
     plotClusterCoefficients(
         ax[2], lr.fit(cent1.iloc[:, :-1], y.values), title="TI weights"

diff --git a/ddmc/figures/figureMS2.py b/ddmc/figures/figureMS2.py
@@ -44,7 +44,7 @@ def makeFigure():
     xlabels = [20, 21, 22, 23, 24, 25]
     for ii, cc in enumerate(cl_num):
         cluster = "Cluster " + str(cc)
-        plotMotifs([pssms[ii]], axes=[ax[ii]], titles=[cluster], yaxis=[0, 10])
+        plotMotifs(pssms[ii], ax=ax[ii], titles=cluster, yaxis=[0, 10])
         if ii not in ylabels:
             ax[ii].set_ylabel("")
             ax[ii].get_yaxis().set_visible(False)

diff --git a/ddmc/figures/figureMS4.py b/ddmc/figures/figureMS4.py
@@ -8,6 +8,7 @@
 import seaborn as sns
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.cluster import KMeans
+from sklearn.preprocessing import label_binarize
 from ..clustering import DDMC
 from .common import subplotLabel, getSetup
 from ..pre_processing import filter_NaNpeptides
@@ -42,9 +43,8 @@ def makeFigure():
     z = TumorType(d)
     z.iloc[:, -1] = z.iloc[:, -1].replace("Normal", "NAT")
     d = z.iloc[:, 1:-1]
-    y = z.iloc[:, -1]
-    y = y.replace("NAT", 0)
-    y = y.replace("Tumor", 1)
+
+    y = label_binarize(z["Type"], classes=["NAT", "Tumor"]).ravel()
 
     # DDMC ROC
     ncl = 30
@@ -70,48 +70,32 @@ def makeFigure():
     # Tumor vs NAT unclustered
     plotROC(ax[0], lr, d.values, y, cv_folds=4, title="ROC unclustered")
     ax[0].set_title("Unclustered ROC")
-    plot_unclustered_LRcoef(ax[1], lr, d, y, z)
+    plot_unclustered_LRcoef(ax[1], lr, d, y)
 
     # k-means
-    labels = KMeans(n_clusters=ncl).fit(d.T).labels_
-    x_ = X.copy()
-    x_["Cluster"] = labels
-    c_kmeans = x_.groupby("Cluster").mean().T
-    c_kmeans.columns = list(np.arange(ncl) + 1)
-    km_lr = lr.fit(c_kmeans, y)
-    plotROC(ax[3], km_lr, c_kmeans.values, y, cv_folds=4, title="ROC k-means")
+    kmeans = KMeans(n_clusters=ncl).fit(d.T)
+
+    plotROC(ax[3], lr, kmeans.cluster_centers_.T, y, cv_folds=4, title="ROC k-means")
     ax[3].set_title("k-means ROC")
 
     # GMM
     gmm = DDMC(
         X["Sequence"], n_components=ncl, SeqWeight=0, distance_method="Binomial"
     ).fit(d)
-    x_ = X.copy()
-    x_["Cluster"] = gmm.labels()
-    c_gmm = x_.groupby("Cluster").mean().T
-    gmm_lr = lr.fit(c_gmm, y)
-    plotROC(ax[4], gmm_lr, c_gmm.values, y, cv_folds=4, title="ROC GMM")
+
+    plotROC(ax[4], lr, gmm.transform(), y, cv_folds=4, title="ROC GMM")
     ax[4].set_title("GMM ROC")
 
     return f
 
 
-def plot_unclustered_LRcoef(ax, lr, d, y, z, title=False):
+def plot_unclustered_LRcoef(ax, lr, X: pd.DataFrame, y: np.ndarray):
     """Plot logistic regression coefficients of unclustered data"""
     weights = []
     w = pd.DataFrame()
     for _ in range(3):
-        lr = LogisticRegressionCV(
-            cv=3,
-            solver="saga",
-            max_iter=10000,
-            n_jobs=-1,
-            penalty="elasticnet",
-            l1_ratios=[0.85],
-            class_weight="balanced",
-        )
-        w["Coefficients"] = lr.fit(d, y).coef_[0]
-        w["p-sites"] = z.columns[2:]
+        w["Coefficients"] = lr.fit(X, y).coef_[0]
+        w["p-sites"] = X.columns
         weights.append(w)
 
     coefs = pd.concat(weights)

diff --git a/ddmc/figures/figureMS7.py b/ddmc/figures/figureMS7.py
@@ -42,7 +42,7 @@ def makeFigure():
 
     # Fit DDMC to complete data
     d = np.array(X.select_dtypes(include=["float64"]).T)
-    i = X.select_dtypes(include=["object"])
+    i = X["Sequence"]
 
     assert np.all(np.isfinite(d))
     model_min = DDMC(i, n_components=30, SeqWeight=100, distance_method="Binomial").fit(