Fix various issues: update CI workflows, correct data paths and imports, add mypy

meyer-lab · Jan 19, 2024 · da585f5 · da585f5
1 parent a9b07ac
commit da585f5
Show file tree

Hide file tree

Showing 14 changed files with 100 additions and 37 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -6,15 +6,13 @@ jobs:
   build:
     runs-on: self-hosted
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Install dependencies
-      run: |
-        make clean
-        make venv
+      run: poetry install
     - name: Build figures
       run: make -j 3 all
     - name: Upload files
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
       with:
         name: files
         path: output
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -6,14 +6,13 @@ jobs:
   build:
     runs-on: self-hosted
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Install dependencies
-      run: make venv
+      run: poetry install
     - name: Test with pytest
-      run: |
-        . venv/bin/activate && pytest --cov=ddmc --cov-report=xml --cov-config=.github/workflows/coveragerc
+      run: make coverage.xml
     - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v2
+      uses: codecov/codecov-action@v3
       with:
         file: ./coverage.xml
         flags: unittests

diff --git a/ddmc/clustering.py b/ddmc/clustering.py
@@ -1,5 +1,6 @@
 """ Clustering functions. """
 
+from typing import Literal
 import warnings
 from copy import deepcopy
 import itertools
@@ -22,7 +23,12 @@ class DDMC(GaussianMixture):
     should have a larger effect on the peptide assignment."""
 
     def __init__(
-        self, info, n_components, SeqWeight, distance_method, random_state=None
+        self,
+        info: pd.DataFrame,
+        n_components: int,
+        SeqWeight: float,
+        distance_method: Literal["PAM250", "Binomial"],
+        random_state=None,
     ):
         super().__init__(
             n_components=n_components,
@@ -40,7 +46,7 @@ def __init__(
         seqs = [s.upper() for s in info["Sequence"]]
 
         if distance_method == "PAM250":
-            self.seqDist = PAM250(seqs)
+            self.seqDist: PAM250 | Binomial = PAM250(seqs)
         elif distance_method == "Binomial":
             self.seqDist = Binomial(info["Sequence"], seqs)
         else:

diff --git a/ddmc/figures/common.py b/ddmc/figures/common.py
@@ -9,9 +9,7 @@
 import svgutils.transform as st
 import numpy as np
 import pandas as pd
-import scipy as sp
 import seaborn as sns
-import textwrap
 import mygene
 from matplotlib import gridspec, pyplot as plt
 from string import ascii_uppercase
@@ -468,13 +466,13 @@ def ExportClusterFile(cluster, cptac=False, mcf7=False):
     """Export cluster SVG file for NetPhorest and GO analysis."""
     if cptac:
         c = pd.read_csv(
-            "msresist/data/cluster_members/CPTAC_DDMC_35CL_W100_MembersCluster"
+            "ddmc/data/cluster_members/CPTAC_DDMC_35CL_W100_MembersCluster"
             + str(cluster)
             + ".csv"
         )
     if mcf7:
         c = pd.read_csv(
-            "msresist/data/cluster_members/msresist/data/cluster_members/CPTAC_MF7_20CL_W5_MembersCluster"
+            "ddmc/data/cluster_members/msresist/data/cluster_members/CPTAC_MF7_20CL_W5_MembersCluster"
             + str(cluster)
             + ".csv"
         )
@@ -622,7 +620,7 @@ def TransformCenters(model, X):
 def HotColdBehavior(centers):
     # Import Cold-Hot Tumor data
     y = (
-        pd.read_csv("msresist/data/CPTAC_LUAD/Hot_Cold.csv")
+        pd.read_csv("ddmc/data/CPTAC_LUAD/Hot_Cold.csv")
         .dropna(axis=1)
         .sort_values(by="Sample ID")
     )

diff --git a/ddmc/figures/figureM2.py b/ddmc/figures/figureM2.py
@@ -128,16 +128,14 @@ def ErrorAcross(distance_method, weights, n_clusters, n_runs=1, tmt=6):
     """Calculate missingness error across different number of clusters."""
     assert len(weights) == len(n_clusters)
     X = filter_NaNpeptides(
-        pd.read_csv("msresist/data/CPTAC_LUAD/CPTAC-preprocessedMotfis.csv").iloc[
-            :, 1:
-        ],
+        pd.read_csv("ddmc/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv").iloc[:, 1:],
         tmt=tmt,
     )
     X.index = np.arange(X.shape[0])
     md = X.copy()
     info = md.select_dtypes(include=["object"])
     X = X.select_dtypes(include=["float64"])
-    StoE = pd.read_csv("msresist/data/CPTAC_LUAD/IDtoExperiment.csv")
+    StoE = pd.read_csv("ddmc/data/MS/CPTAC/IDtoExperiment.csv")
     assert all(StoE.iloc[:, 0] == X.columns), "Sample labels don't match."
     X = X.to_numpy()
     tmtIDX = StoE["Experiment (TMT10plex)"].to_numpy()
@@ -166,7 +164,9 @@ def ErrorAcross(distance_method, weights, n_clusters, n_runs=1, tmt=6):
             dfs = pd.Series(
                 [ii, cluster, weights[jj], eDDMC, *baseline_errors], index=df.columns
             )
-            df = df.append(dfs, ignore_index=True)
+            print(df)
+            print(dfs)
+            df = pd.concat([df, dfs], ignore_index=True)
 
     return df
 

diff --git a/ddmc/figures/figureM4.py b/ddmc/figures/figureM4.py
@@ -41,7 +41,7 @@ def makeFigure():
     i = X.select_dtypes(include=[object])
 
     # Plot mean AUCs per model
-    p = pd.read_csv("ddmc/data/Validations/preds_phenotypes_rs_15cl.csv").iloc[:, 1:]
+    p = pd.read_csv("ddmc/data/Performance/preds_phenotypes_rs_15cl.csv").iloc[:, 1:]
     p = p.melt(
         id_vars=["Run", "Weight"],
         value_vars=d.columns[2:],
@@ -110,7 +110,7 @@ def calculate_AUCs_phenotypes(ax, X, nRuns=3, n_components=35):
             run.append(r)
             ws.append(w)
             model = DDMC(
-                i, n_components=ncl, SeqWeight=w, distance_method="Binomial"
+                i, n_components=n_components, SeqWeight=w, distance_method="Binomial"
             ).fit(d)
 
             # Find and scale centers

diff --git a/ddmc/figures/figureM5.py b/ddmc/figures/figureM5.py
@@ -238,7 +238,6 @@ def plot_clusters_binaryfeatures(centers, id_var, ax, pvals=False, loc="best"):
         dodge=True,
         ax=ax,
         linewidth=0.25,
-        fliersize=2,
     )
     ax.legend(prop={"size": 8}, loc=loc)
 

diff --git a/ddmc/figures/figureM7.py b/ddmc/figures/figureM7.py
@@ -9,13 +9,12 @@
 import textwrap
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.preprocessing import StandardScaler
-from .common import subplotLabel, getSetup
+from .common import subplotLabel, getSetup, plotDistanceToUpstreamKinase
 from .figureM5 import (
     build_pval_matrix,
     calculate_mannW_pvals,
     plot_clusters_binaryfeatures,
 )
-from .commmon import plotDistanceToUpstreamKinase
 from ..clustering import DDMC
 from ..logistic_regression import plotROC, plotClusterCoefficients
 from ..pre_processing import filter_NaNpeptides

diff --git a/ddmc/figures/figureMS7.py b/ddmc/figures/figureMS7.py
@@ -5,7 +5,6 @@
 import matplotlib
 import numpy as np
 import pandas as pd
-from scipy.sparse.construct import random
 import seaborn as sns
 from sklearn.linear_model import LogisticRegressionCV
 from sklearn.cluster import KMeans

diff --git a/ddmc/gsea.py b/ddmc/gsea.py
@@ -2,10 +2,7 @@
 All functions related to GSEA analysis of clusters
 """
 
-import pickle
-import pandas as pd
 import mygene
-from ddmc.pre_processing import preprocessing, filter_NaNpeptides
 
 
 path = "/Users/creixell/Desktop/"

diff --git a/ddmc/motifs.py b/ddmc/motifs.py
@@ -3,16 +3,12 @@
 import glob
 import pandas as pd
 import numpy as np
-import os
 import re
 from Bio import SeqIO
 from Bio.Seq import Seq
 from .binomial import AAlist
 
 
-path = os.path.dirname(os.path.abspath(__file__))
-
-
 def MapMotifs(X, names):
     """Generate pY motifs for pre-processing."""
     names, seqs, pXpos, Xidx = GeneratingKinaseMotifs(names, FormatSeq(X))
@@ -153,7 +149,7 @@ def GeneratingKinaseMotifs(names, seqs):
     """Main function to generate motifs using 'findmotif'."""
     motif_size = 5
     proteome = open(
-        os.path.join(path, "./data/Sequence_analysis/proteome_uniprot2019.fa"), "r"
+        "./data/Sequence_analysis/proteome_uniprot2019.fa", "r"
     )
     ProteomeDict = DictProteomeNameToSeq(proteome, n="gene")
     protnames, seqs, Xidx = MatchProtNames(ProteomeDict, names, seqs)

diff --git a/makefile b/makefile
@@ -14,3 +14,6 @@ coverage.xml:
 
 clean:
 	rm -rf *.pdf pylint.log output
+
+mypy:
+	poetry run mypy --install-types --non-interactive --ignore-missing-imports ddmc
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ logomaker = "^0.8"
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.4"
 pytest-cov = "^4.1.0"
+mypy = "^1.8.0"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/makefile b/makefile
@@ -14,3 +14,6 @@ coverage.xml:
 
 clean:
 	rm -rf *.pdf pylint.log output
+
+mypy:
+	poetry run mypy --install-types --non-interactive --ignore-missing-imports ddmc