Lots more cleanup

meyer-lab · Jan 21, 2024 · fb310e2
1 parent 6bd6040
commit fb310e2
Show file tree

Hide file tree

Showing 15 changed files with 113 additions and 286 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,7 +10,7 @@ jobs:
     - name: Install dependencies
       run: poetry install
     - name: Build figures
-      run: make -i all
+      run: make -j 2 all
     - name: Upload files
       uses: actions/upload-artifact@v4
       with:

diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py
@@ -4,21 +4,29 @@
 import sys
 import time
 from string import ascii_uppercase
-from matplotlib import gridspec, pyplot as plt, axes
+from matplotlib import gridspec, pyplot as plt, axes, rcParams
 import seaborn as sns
 import numpy as np
 import pandas as pd
 import svgutils.transform as st
 import logomaker as lm
 from sklearn.preprocessing import StandardScaler
+from ..pre_processing import filter_NaNpeptides
+from ..clustering import DDMC
 from scipy.stats import mannwhitneyu
 from statsmodels.stats.multitest import multipletests
 from ..pre_processing import MeanCenter
 from ..motifs import KinToPhosphotypeDict
 
 
+rcParams["font.sans-serif"] = "Arial"
+
+
 def getSetup(
-    figsize: tuple[int, int], gridd: tuple[int, int], multz: None | dict = None
+    figsize: tuple[int, int],
+    gridd: tuple[int, int],
+    multz: None | dict = None,
+    labels=True,
 ) -> tuple:
     """Establish figure set-up with subplots."""
     sns.set(
@@ -47,6 +55,9 @@ def getSetup(
             x += multz[x]
         x += 1
 
+    if labels:
+        subplotLabel(ax)
+
     return (ax, f)
 
 
@@ -64,7 +75,9 @@ def subplotLabel(axs: list[axes.Axes]):
         )
 
 
-def overlayCartoon(figFile, cartoonFile, x, y, scalee: float = 1.0):
+def overlayCartoon(
+    figFile: str, cartoonFile: str, x: float, y: float, scalee: float = 1.0
+):
     """Add cartoon to a figure file."""
 
     # Overlay Figure cartoons
@@ -113,6 +126,26 @@ def genFigure():
     print(f"Figure {sys.argv[1]} is done after {time.time() - start} seconds.\n")
 
 
+def getDDMC_CPTAC(n_components: int, SeqWeight: float) -> DDMC:
+    # Import signaling data
+    X = filter_NaNpeptides(
+        pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
+        tmt=2,
+    )
+    d = X.select_dtypes(include=[float]).T
+    i = X["Sequence"]
+
+    # Fit DDMC
+    model = DDMC(
+        i,
+        n_components=n_components,
+        SeqWeight=SeqWeight,
+        distance_method="Binomial",
+        random_state=5,
+    ).fit(d)
+    return model
+
+
 def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
     """Generate logo plots of a list of PSSMs"""
     pssm = pssm.T
@@ -140,11 +173,10 @@ def plotMotifs(pssm, ax: axes.Axes, titles=False, yaxis=False):
 
 
 def plotDistanceToUpstreamKinase(
-    model,
-    clusters,
+    model: DDMC,
+    clusters: list[int],
     ax,
-    kind="strip",
-    num_hits=5,
+    num_hits: int = 5,
     additional_pssms=False,
     add_labels=False,
     title=False,
@@ -161,52 +193,45 @@ def plotDistanceToUpstreamKinase(
     if isinstance(add_labels, list):
         clusters += add_labels
     data = ukin_mc.sort_values(by="Kinase").set_index("Kinase")[clusters]
-    if kind == "heatmap":
-        sns.heatmap(data.T, ax=ax, xticklabels=data.index)
-        cbar = ax.collections[0].colorbar
-        cbar.ax.tick_params(labelsize=7)
-        ax.set_ylabel("Cluster")
-
-    elif kind == "strip":
-        data = pd.melt(
-            data.reset_index(),
-            id_vars="Kinase",
-            value_vars=list(data.columns),
-            var_name="Cluster",
-            value_name="Frobenius Distance",
-        )
-        if isinstance(add_labels, list):
-            # Actual ERK predictions
-            data["Cluster"] = data["Cluster"].astype(str)
-            d1 = data[~data["Cluster"].str.contains("_S")]
-            sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0])
-            print(cOG)
-            AnnotateUpstreamKinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1)
-
-            # Shuffled
-            d2 = data[data["Kinase"] == "ERK2"]
-            d2["Shuffled"] = ["_S" in s for s in d2["Cluster"]]
-            d2["Cluster"] = [s.split("_S")[0] for s in d2["Cluster"]]
-            sns.stripplot(
-                data=d2,
-                x="Cluster",
-                y="Frobenius Distance",
-                hue="Shuffled",
-                ax=ax[1],
-                size=8,
-            )
-            ax[1].set_title("ERK2 Shuffled Positions")
-            ax[1].legend(prop={"size": 10}, loc="lower left")
-            DrawArrows(ax[1], d2)
 
-        else:
-            sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax)
-            AnnotateUpstreamKinases(model, clusters, ax, data, num_hits)
-            if title:
-                ax.set_title(title)
+    data = pd.melt(
+        data.reset_index(),
+        id_vars="Kinase",
+        value_vars=list(data.columns),
+        var_name="Cluster",
+        value_name="Frobenius Distance",
+    )
+    if isinstance(add_labels, list):
+        # Actual ERK predictions
+        data["Cluster"] = data["Cluster"].astype(str)
+        d1 = data[~data["Cluster"].str.contains("_S")]
+        sns.stripplot(data=d1, x="Cluster", y="Frobenius Distance", ax=ax[0])
+        print(cOG)
+        AnnotateUpstreamKinases(model, list(cOG) + ["ERK2+"], ax[0], d1, 1)
+
+        # Shuffled
+        d2 = data[data["Kinase"] == "ERK2"]
+        d2["Shuffled"] = ["_S" in s for s in d2["Cluster"]]
+        d2["Cluster"] = [s.split("_S")[0] for s in d2["Cluster"]]
+        sns.stripplot(
+            data=d2,
+            x="Cluster",
+            y="Frobenius Distance",
+            hue="Shuffled",
+            ax=ax[1],
+            size=8,
+        )
+        ax[1].set_title("ERK2 Shuffled Positions")
+        ax[1].legend(prop={"size": 10}, loc="lower left")
+        DrawArrows(ax[1], d2)
+    else:
+        sns.stripplot(data=data, x="Cluster", y="Frobenius Distance", ax=ax)
+        AnnotateUpstreamKinases(model, clusters, ax, data, num_hits)
+        if title:
+            ax.set_title(title)
 
 
-def AnnotateUpstreamKinases(model, clusters, ax, data, num_hits=1):
+def AnnotateUpstreamKinases(model: DDMC, clusters, ax, data, num_hits: int = 1):
     """Annotate upstream kinase predictions"""
     data.iloc[:, 1] = data.iloc[:, 1].astype(str)
     pssms, _ = model.pssms()

diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py
@@ -1,12 +1,11 @@
 """
 This creates Figure 2: Evaluation of Imputating Missingness
 """
-import matplotlib
 import numpy as np
 from scipy.stats import gmean
 import pandas as pd
 import seaborn as sns
-from .common import subplotLabel, getSetup
+from .common import getSetup
 from ..clustering import DDMC
 from ..pre_processing import filter_NaNpeptides
 from fancyimpute import IterativeSVD
@@ -17,19 +16,6 @@ def makeFigure():
     # Get list of axis objects
     ax, f = getSetup((10, 10), (3, 3), multz={0: 2})
 
-    # Set plotting format
-    matplotlib.rcParams["font.sans-serif"] = "Arial"
-    sns.set(
-        style="whitegrid",
-        font_scale=1,
-        color_codes=True,
-        palette="colorblind",
-        rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6},
-    )
-
-    # Add subplot labels
-    subplotLabel(ax)
-
     # diagram explaining reconstruction process
     ax[0].axis("off")
 

diff --git a/ddmc/figures/figureM3.py b/ddmc/figures/figureM3.py
@@ -6,10 +6,9 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-import matplotlib
 from ..clustering import DDMC
 from ..binomial import AAlist
-from .common import subplotLabel, getSetup
+from .common import getSetup
 from ..pca import plotPCA
 from .common import plotDistanceToUpstreamKinase, plotMotifs
 from ..clustering import compute_control_pssm
@@ -23,19 +22,6 @@ def makeFigure():
     # Get list of axis objects
     ax, f = getSetup((12, 12), (3, 3), multz={3: 1})
 
-    # Add subplot labels
-    subplotLabel(ax)
-
-    # Set plotting format
-    matplotlib.rcParams["font.sans-serif"] = "Arial"
-    sns.set(
-        style="whitegrid",
-        font_scale=1,
-        color_codes=True,
-        palette="colorblind",
-        rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6},
-    )
-
     # Import signaling data
     x = preprocess_ebdt_mcf7()
     d = x.select_dtypes(include=[float]).T
@@ -90,7 +76,7 @@ def makeFigure():
     erk2 = compute_control_pssm([s.upper() for s in erk2["Peptide"]])
     erk2 = pd.DataFrame(np.clip(erk2, a_min=0, a_max=3))
     erk2.index = AAlist
-    plotMotifs(erk2, axes=ax[5], titles="ERK2")
+    plotMotifs(erk2, ax=ax[5], titles="ERK2")
 
     # ERK2 prediction
     # Import signaling data

diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py
@@ -5,12 +5,11 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-import matplotlib
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import mean_squared_error
 from ..clustering import DDMC
-from .common import subplotLabel, getSetup, HotColdBehavior
+from .common import getSetup, HotColdBehavior, getDDMC_CPTAC
 from ..logistic_regression import plotROC
 from ..pre_processing import filter_NaNpeptides
 
@@ -20,25 +19,11 @@ def makeFigure():
     # Get list of axis objects
     ax, f = getSetup((8, 4), (1, 3))
 
-    # Add subplot labels
-    subplotLabel(ax)
-
-    # Set plotting format
-    matplotlib.rcParams["font.sans-serif"] = "Arial"
-    sns.set(
-        style="whitegrid",
-        font_scale=0.5,
-        color_codes=True,
-        palette="colorblind",
-        rc={"grid.linestyle": "dotted", "axes.linewidth": 0.6},
-    )
-
     X = filter_NaNpeptides(
         pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
         tmt=2,
     )
     d = X.select_dtypes(include=[float]).T
-    i = X["Sequence"]
 
     return f  # TODO: This code is broken.
 
@@ -61,15 +46,9 @@ def makeFigure():
     ax[0].legend(prop={"size": 5}, loc=0)
 
     # Fit Data, Mix, and Seq Models
-    dataM = DDMC(
-        i, n_components=30, SeqWeight=0, distance_method="Binomial", random_state=5
-    ).fit(d)
-    mixM = DDMC(
-        i, n_components=30, SeqWeight=250, distance_method="Binomial", random_state=5
-    ).fit(d)
-    seqM = DDMC(
-        i, n_components=30, SeqWeight=1e6, distance_method="Binomial", random_state=5
-    ).fit(d)
+    dataM = getDDMC_CPTAC(n_components=30, SeqWeight=0.0)
+    mixM = getDDMC_CPTAC(n_components=30, SeqWeight=250.0)
+    seqM = getDDMC_CPTAC(n_components=30, SeqWeight=1.0e6)
     models = [dataM, mixM, seqM]
 
     # Center to peptide distance
@@ -81,7 +60,7 @@ def makeFigure():
     return f
 
 
-def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35):
+def calculate_AUCs_phenotypes(ax, X: pd.DataFrame, nRuns=3, n_components=35):
     """Plot mean AUCs per phenotype across weights."""
     # Signaling
     d = X.select_dtypes(include=[float]).T
@@ -124,7 +103,7 @@ def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35):
                     ax,
                     lr,
                     centers_gen.values,
-                    y["STK11.mutation.status"],
+                    y["STK11.mutation.status"].values,
                     cv_folds=3,
                     return_mAUC=True,
                     kfold="Repeated",
@@ -238,7 +217,7 @@ def merge_binary_vectors(y, mutant1, mutant2):
     return pd.Series(y_)
 
 
-def find_patients_with_NATandTumor(X, label, conc=False):
+def find_patients_with_NATandTumor(X: pd.DataFrame, label, conc=False) -> pd.DataFrame:
     """Reshape data to display patients as rows and samples (Tumor and NAT per cluster) as columns.
     Note that to do so, samples that don't have their tumor/NAT counterpart are dropped.
     """
@@ -259,7 +238,7 @@ def find_patients_with_NATandTumor(X, label, conc=False):
     return X
 
 
-def TransformCenters(model, X):
+def TransformCenters(model: DDMC, X: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     """For a given model, find centers and transform for regression."""
     centers = pd.DataFrame(model.transform()).T
     centers.iloc[:, :] = StandardScaler(with_std=False).fit_transform(