Mondo and prop (#313)

* swapped out new file names * updated filenames for Symbol-to-Entrez * changed disgenet to mondo * added error handling for no monarch fly * added zscores and p-values to results tables * added statsmodels to do p-value adjustments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
krishnanlab · May 8, 2024 · 62cfc9a · 62cfc9a
1 parent 3499a60
commit 62cfc9a
Show file tree

Hide file tree

Showing 9 changed files with 53 additions and 38 deletions.
diff --git a/geneplexus/_config/config.py b/geneplexus/_config/config.py
@@ -26,7 +26,7 @@
 ALL_TASKS = ["IDconversion", "MachineLearning", "Similarities", "NetworkGraph", "OriginalGSCs"]
 ALL_NETWORKS = ["BioGRID", "STRING", "IMP"]
 ALL_FEATURES = ["SixSpeciesN2V"]
-ALL_GSCS = ["GO", "Monarch", "DisGeNet", "Combined"]
+ALL_GSCS = ["GO", "Monarch", "Mondo", "Combined"]
 ALL_SPECIES = ["Human", "Mouse", "Fly", "Worm", "Fish", "Yeast"]
 
 DEFAULT_LOGREG_KWARGS: Dict[str, Any] = {
@@ -53,7 +53,7 @@
 TASK_TYPE = Literal["IDconversion", "MachineLearning", "Similarities", "NetworkGraph", "OriginalGSCs"]
 NET_TYPE = Literal["BioGRID", "STRING", "IMP"]
 FEATURE_TYPE = Literal["SixSpeciesN2V"]
-GSC_TYPE = Literal["GO", "Monarch", "DisGeNet", "Combined"]
+GSC_TYPE = Literal["GO", "Monarch", "Mondo", "Combined"]
 SPECIES_TYPE = Literal["Human", "Mouse", "Fly", "Worm", "Fish", "Yeast"]
 
 TASK_SELECTION_TYPE = Union[Literal["All"], TASK_TYPE, List[TASK_TYPE]]

diff --git a/geneplexus/_config/data_filenames.txt b/geneplexus/_config/data_filenames.txt
@@ -1,13 +1,10 @@
-IDconversion__Mouse__Entrez-to-ENSG.json
 IDconversion__Mouse__Entrez-to-Name.json
 GSC__Zebrafish__GO__STRING.json
 GSC__Zebrafish__Combined__IMP.json
 GSC__Yeast__Combined__IMP.json
 GSC__Yeast__Monarch__BioGRID.json
-IDconversion__Fly__Entrez-to-ENSG.json
 GSC__Yeast__Monarch__STRING.json
 NodeOrder__Fly__BioGRID.txt
-GSC__Fly__Combined__IMP.json
 Data__Yeast__SixSpeciesN2V__IMP.npy
 NodeOrder__Mouse__IMP.txt
 GSC__Mouse__Combined__BioGRID.json
@@ -16,11 +13,9 @@ GSC__Human__Monarch__BioGRID.json
 GSC__Yeast__GO__IMP.json
 IDconversion__Zebrafish__ENSP-to-Entrez.json
 IDconversion__Worm__Entrez-to-Name.json
-IDconversion__Worm__Entrez-to-ENSG.json
-PreTrainedWeights__Fly__Combined__IMP__SixSpeciesN2V.json
+GSC__Human__Mondo__STRING.json
 Edgelist__Human__IMP.edg
 IDconversion__Human__ENSP-to-Entrez.json
-PreTrainedWeights__Human__DisGeNet__STRING__SixSpeciesN2V.json
 GSC__Mouse__Monarch__IMP.json
 PreTrainedWeights__Yeast__Combined__BioGRID__SixSpeciesN2V.json
 NodeOrder__Mouse__STRING.txt
@@ -34,7 +29,6 @@ GSC__Human__GO__BioGRID.json
 PreTrainedWeights__Yeast__GO__STRING__SixSpeciesN2V.json
 GSC__Zebrafish__GO__IMP.json
 Edgelist__Worm__BioGRID.edg
-GSC__Fly__Monarch__BioGRID.json
 PreTrainedWeights__Mouse__Combined__STRING__SixSpeciesN2V.json
 IDconversion__Mouse__Entrez-to-Symbol.json
 PreTrainedWeights__Zebrafish__Monarch__IMP__SixSpeciesN2V.json
@@ -48,23 +42,18 @@ NodeOrder__Zebrafish__IMP.txt
 GSC__Yeast__Combined__BioGRID.json
 IDconversion__Mouse__ENST-to-Entrez.json
 IDconversion__Human__ENST-to-Entrez.json
-PreTrainedWeights__Fly__Monarch__IMP__SixSpeciesN2V.json
 IDconversion__Fly__Entrez-to-Name.json
 NodeOrder__Worm__STRING.txt
 PreTrainedWeights__Mouse__Combined__IMP__SixSpeciesN2V.json
 PreTrainedWeights__Yeast__GO__BioGRID__SixSpeciesN2V.json
-GSC__Fly__Combined__STRING.json
 IDconversion__Zebrafish__ENSG-to-Entrez.json
-GSC__Human__DisGeNet__IMP.json
 NodeOrder__Worm__IMP.txt
 IDconversion__Human__Entrez-to-Name.json
 Edgelist__Yeast__IMP.edg
 GSC__Mouse__Combined__STRING.json
 Data__Fly__SixSpeciesN2V__BioGRID.npy
 GSC__Mouse__GO__IMP.json
-PreTrainedWeights__Human__DisGeNet__IMP__SixSpeciesN2V.json
 Data__Worm__SixSpeciesN2V__STRING.npy
-PreTrainedWeights__Fly__Combined__STRING__SixSpeciesN2V.json
 Data__Fly__SixSpeciesN2V__IMP.npy
 PreTrainedWeights__Mouse__Monarch__BioGRID__SixSpeciesN2V.json
 NodeOrder__Human__BioGRID.txt
@@ -74,7 +63,6 @@ GSC__Worm__Combined__IMP.json
 Data__Mouse__SixSpeciesN2V__BioGRID.npy
 Data__Mouse__SixSpeciesN2V__STRING.npy
 IDconversion__Worm__ENSP-to-Entrez.json
-GSC__Fly__Combined__BioGRID.json
 GSC__Human__Combined__IMP.json
 NodeOrder__Yeast__STRING.txt
 Edgelist__Yeast__STRING.edg
@@ -88,8 +76,8 @@ NodeOrder__Human__IMP.txt
 IDconversion__Zebrafish__Entrez-to-Name.json
 GSC__Mouse__GO__BioGRID.json
 NodeOrder__Worm__BioGRID.txt
+PreTrainedWeights__Human__Mondo__STRING__SixSpeciesN2V.json
 Edgelist__Fly__IMP.edg
-PreTrainedWeights__Fly__Monarch__STRING__SixSpeciesN2V.json
 IDconversion__Human__Symbol-to-Entrez.json
 GSC__Human__Combined__BioGRID.json
 NodeOrder__Yeast__IMP.txt
@@ -113,7 +101,9 @@ Edgelist__Yeast__BioGRID.edg
 IDconversion__Fly__ENST-to-Entrez.json
 NodeOrder__Yeast__BioGRID.txt
 IDconversion__Fly__ENSP-to-Entrez.json
+PreTrainedWeights__Human__Mondo__IMP__SixSpeciesN2V.json
 IDconversion__Fly__Entrez-to-Symbol.json
+GSC__Human__Mondo__BioGRID.json
 GSC__Yeast__GO__BioGRID.json
 PreTrainedWeights__Yeast__Monarch__STRING__SixSpeciesN2V.json
 GSC__Fly__GO__IMP.json
@@ -127,22 +117,20 @@ GSC__Yeast__GO__STRING.json
 Data__Mouse__SixSpeciesN2V__IMP.npy
 PreTrainedWeights__Human__Combined__BioGRID__SixSpeciesN2V.json
 IDconversion__Mouse__ENSG-to-Entrez.json
-GSC__Fly__Monarch__STRING.json
 GSC__Worm__Combined__BioGRID.json
 NodeOrder__Fly__STRING.txt
 GSC__Worm__Monarch__IMP.json
 IDconversion__Yeast__ENSG-to-Entrez.json
 IDconversion__Yeast__Symbol-to-Entrez.json
 GSC__Mouse__GO__STRING.json
-IDconversion__Human__Entrez-to-ENSG.json
 GSC__Worm__Monarch__STRING.json
 PreTrainedWeights__Human__Monarch__BioGRID__SixSpeciesN2V.json
-IDconversion__Zebrafish__Entrez-to-ENSG.json
 PreTrainedWeights__Worm__Combined__STRING__SixSpeciesN2V.json
 NodeOrder__Mouse__BioGRID.txt
 GSC__Worm__Monarch__BioGRID.json
 PreTrainedWeights__Zebrafish__GO__IMP__SixSpeciesN2V.json
 PreTrainedWeights__Mouse__Monarch__IMP__SixSpeciesN2V.json
+PreTrainedWeights__Human__Mondo__BioGRID__SixSpeciesN2V.json
 PreTrainedWeights__Zebrafish__Monarch__STRING__SixSpeciesN2V.json
 NodeOrder__Fly__IMP.txt
 PreTrainedWeights__Worm__GO__BioGRID__SixSpeciesN2V.json
@@ -156,14 +144,12 @@ GSC__Zebrafish__Monarch__STRING.json
 GSC__Human__Monarch__IMP.json
 Data__Human__SixSpeciesN2V__BioGRID.npy
 GSC__Human__GO__IMP.json
-GSC__Fly__Monarch__IMP.json
 Edgelist__Fly__STRING.edg
 PreTrainedWeights__Yeast__GO__IMP__SixSpeciesN2V.json
 Data__Worm__SixSpeciesN2V__BioGRID.npy
 Edgelist__Mouse__STRING.edg
 Edgelist__Mouse__BioGRID.edg
 IDconversion__Worm__Symbol-to-Entrez.json
-PreTrainedWeights__Human__DisGeNet__BioGRID__SixSpeciesN2V.json
 PreTrainedWeights__Human__GO__BioGRID__SixSpeciesN2V.json
 Data__Fly__SixSpeciesN2V__STRING.npy
 Data__Yeast__SixSpeciesN2V__STRING.npy
@@ -172,25 +158,21 @@ GSC__Human__Monarch__STRING.json
 Data__Zebrafish__SixSpeciesN2V__IMP.npy
 IDconversion__Fly__ENSG-to-Entrez.json
 PreTrainedWeights__Human__Combined__STRING__SixSpeciesN2V.json
-PreTrainedWeights__Fly__Combined__BioGRID__SixSpeciesN2V.json
 IDconversion__Mouse__ENSP-to-Entrez.json
+GSC__Human__Mondo__IMP.json
 NodeOrder__Human__STRING.txt
 PreTrainedWeights__Human__GO__IMP__SixSpeciesN2V.json
 Edgelist__Worm__STRING.edg
-IDconversion__Yeast__Entrez-to-ENSG.json
-GSC__Human__DisGeNet__BioGRID.json
 GSC__Human__GO__STRING.json
 GSC__Worm__GO__BioGRID.json
 PreTrainedWeights__Mouse__GO__IMP__SixSpeciesN2V.json
-GSC__Human__DisGeNet__STRING.json
 Edgelist__Zebrafish__STRING.edg
 Data__Human__SixSpeciesN2V__IMP.npy
 IDconversion__Zebrafish__Symbol-to-Entrez.json
 PreTrainedWeights__Worm__Combined__BioGRID__SixSpeciesN2V.json
 PreTrainedWeights__Worm__GO__STRING__SixSpeciesN2V.json
 IDconversion__Yeast__ENSP-to-Entrez.json
 IDconversion__Worm__ENSG-to-Entrez.json
-PreTrainedWeights__Fly__Monarch__BioGRID__SixSpeciesN2V.json
 PreTrainedWeights__Mouse__Combined__BioGRID__SixSpeciesN2V.json
 IDconversion__Fly__Symbol-to-Entrez.json
 PreTrainedWeights__Human__Monarch__IMP__SixSpeciesN2V.json

diff --git a/geneplexus/_geneplexus.py b/geneplexus/_geneplexus.py
@@ -7,12 +7,14 @@
 import pandas as pd
 from scipy.spatial.distance import cosine
 from scipy.stats import hypergeom
+from scipy.stats import norm
 from scipy.stats import rankdata
 from scipy.stats import zscore
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import average_precision_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.preprocessing import StandardScaler
+from statsmodels.stats.multitest import multipletests
 
 from . import util
 from ._config import logger
@@ -193,16 +195,21 @@ def _make_prob_df(file_loc, sp_trn, sp_tst, net_type, probs, pos_genes_in_net, n
         syms_tmp = util.mapgene(net_genes[idx], Entrez_to_Symbol)
         name_tmp = util.mapgene(net_genes[idx], Entrez_to_Name)
         if sp_trn == sp_tst:
-            prob_results.append([net_genes[idx], syms_tmp, name_tmp, probs[idx], novel_label, class_label])
+            prob_results.append([net_genes[idx], syms_tmp, name_tmp, novel_label, class_label, probs[idx]])
         else:
             prob_results.append([net_genes[idx], syms_tmp, name_tmp, probs[idx]])
     if sp_trn == sp_tst:
-        df_col_names = ["Entrez", "Symbol", "Name", "Probability", "Known/Novel", "Class-Label"]
+        df_col_names = ["Entrez", "Symbol", "Name", "Known/Novel", "Class-Label", "Probability"]
     else:
         df_col_names = ["Entrez", "Symbol", "Name", "Probability"]
     df_probs = pd.DataFrame(prob_results, columns=df_col_names)
     df_probs = df_probs.astype({"Entrez": str, "Probability": float})
     df_probs = df_probs.sort_values(by=["Probability"], ascending=False).reset_index(drop=True)
+    z = zscore(df_probs["Probability"].to_numpy())
+    p = norm.sf(abs(z))
+    rejects, padjusts, b, c = multipletests(p, method="bonferroni", is_sorted=True)
+    df_probs["Z-score"] = z
+    df_probs["P-adjusted"] = padjusts
     df_probs["Rank"] = rankdata(1 / (df_probs["Probability"].to_numpy() + 1e-9), method="min")
     return df_probs
 
@@ -221,12 +228,16 @@ def _make_sim_dfs(file_loc, mdl_weights, species, gsc, net_type, features):
     for idx2, termID_tmp in enumerate(gsc_terms):
         ID_tmp = termID_tmp
         Name_tmp = weights_dict[termID_tmp]["Name"]
+        mdl_sim_tmp = mdl_sims[idx2]
         z_tmp = z[idx2]
-        results_tmp.append([ID_tmp, Name_tmp, z_tmp])
-    df_sim = pd.DataFrame(results_tmp, columns=["ID", "Name", "Similarity"]).sort_values(
+        results_tmp.append([ID_tmp, Name_tmp, mdl_sim_tmp, z_tmp])
+    df_sim = pd.DataFrame(results_tmp, columns=["ID", "Name", "Similarity", "Z-score"]).sort_values(
         by=["Similarity"],
         ascending=False,
     )
+    p = norm.sf(abs(df_sim["Z-score"].to_numpy()))
+    rejects, padjusts, b, c = multipletests(p, method="bonferroni", is_sorted=True)
+    df_sim["P-adjusted"] = padjusts
     df_sim["Rank"] = rankdata(-1 * (df_sim["Similarity"].to_numpy() + 1e-9), method="min")
     return df_sim, weights_dict
 
@@ -235,11 +246,16 @@ def _make_small_edgelist(file_loc, df_probs, species, net_type, num_nodes=50):
     # This will set the max number of genes to look at to a given number
     # Load network as edge list dataframe
     filepath = osp.join(file_loc, f"Edgelist__{species}__{net_type}.edg")
-    df_edge = pd.read_csv(filepath, sep="\t", header=None, names=["Node1", "Node2"])
+    if net_type == "BioGRID":
+        df_edge = pd.read_csv(filepath, sep="\t", header=None, names=["Node1", "Node2"])
+    else:
+        df_edge = pd.read_csv(filepath, sep="\t", header=None, names=["Node1", "Node2", "Weight"])
     df_edge = df_edge.astype({"Node1": str, "Node2": str})
     # Take subgraph induced by top genes
     top_genes = df_probs["Entrez"].to_numpy()[:num_nodes]
     df_edge = df_edge[(df_edge["Node1"].isin(top_genes)) & (df_edge["Node2"].isin(top_genes))]
+    if net_type == "BioGRID":
+        df_edge["Weight"] = [1.0] * df_edge.shape[0]
     genes_in_edge = np.union1d(df_edge["Node1"].unique(), df_edge["Node2"].unique())
     isolated_genes = np.setdiff1d(top_genes, genes_in_edge).tolist()
     # Convert to gene symbol

diff --git a/geneplexus/cli.py b/geneplexus/cli.py
@@ -170,7 +170,7 @@ def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool):
         num_nodes: Number of top predicted genes to include in the induced
             subgraph.
         skip_mdl_sim: Whether or not to skip the computation of model
-            similarities with GO and DisGeNet. This option is not yet available
+            similarities with GO and Mondo. This option is not yet available
             for custom networks.
 
     """
@@ -203,7 +203,7 @@ def save_results(gp, outdir, zip_output, overwrite, skip_mdl_sim):
         zip_output: Whether or not to zip the output directory into a zip file.
         overwrite: Whether or not to overwrite existing results.
         skip_mdl_sim: Whether or not to skip the computation of model
-            similarities with GO and DisGeNet. This option is not yet available
+            similarities with GO and Mondo. This option is not yet available
             for custom networks.
 
     """

diff --git a/geneplexus/custom.py b/geneplexus/custom.py
@@ -116,7 +116,7 @@ def subset_gsc_to_network(
 
     Note:
         Use the :meth:`geneplexus.download.download_select_data` function to
-        get the preprocessed GO and DisGeNet files first.
+        get the preprocessed GO and Mondo files first.
 
     Args:
         data_dir: The directory to save the file

diff --git a/geneplexus/exception.py b/geneplexus/exception.py
@@ -1,6 +1,10 @@
 """GenePlexus exceptions."""
 
 
+class FlyMonarchError(Exception):
+    """Raised because Fly has no Monarch annotations."""
+
+
 class ZebrafishBioGRIDError(Exception):
     """Raised when Zebrafish + BioGRID is tried."""
 

diff --git a/geneplexus/geneplexus.py b/geneplexus/geneplexus.py
@@ -17,6 +17,7 @@
 from ._config.logger_util import set_stream_level
 from .download import download_select_data
 from .exception import CustomDataError
+from .exception import FlyMonarchError
 from .exception import ZebrafishBioGRIDError
 
 
@@ -83,7 +84,7 @@ def __init__(
                 "All",
                 self.net_type,
                 self.features,
-                ["GO", "DisGeNet"],
+                ["GO", "Mondo"],
                 log_level=log_level,
             )
 
@@ -97,6 +98,16 @@ def __init__(
                 "so this combination is not allowed.",
             )
 
+        if (
+            (self.sp_trn == "Fly" and self.gsc_trn == "Monarch")
+            or (self.sp_trn == "Fly" and self.gsc_trn == "Combined")
+            or (self.sp_tst == "Fly" and self.gsc_tst == "Monarch")
+            or (self.sp_tst == "Fly" and self.gsc_tst == "Combined")
+        ):
+            raise FlyMonarchError(
+                f"Fly has no annotations for Monarch.",
+            )
+
     @property
     def _params(self) -> List[str]:
         return [
@@ -371,10 +382,10 @@ def _get_pos_and_neg_genes(self):
         return self.pos_genes_in_net, self.negative_genes, self.net_genes
 
     def make_sim_dfs(self):
-        """Compute similarities bewteen the input genes and GO or DisGeNet.
+        """Compute similarities between the input genes and GO or Mondo.
 
         The similarities are computed based on the model trained on the input
-        gene set and models pre-trained on known GO and DisGeNet gene sets.
+        gene set and models pre-trained on known GO and Mondo gene sets.
 
         :attr:`GenePlexus.df_sim_GO` (DataFrame)
             A table with 4 columns: **ID** (the GO term ID), **Name** (name of
@@ -394,7 +405,7 @@ def make_sim_dfs(self):
             the GO term), **Weights** (pretrained model weights), **PosGenes**
             (positive genes for this GO term).
         :attr:`GenePlexus.weights_Dis`
-            Dictionary of pretrained model weights for DisGeNet. A key is a DO
+            Dictionary of pretrained model weights for Mondo. A key is a DO
             term, and the value is a dictionary with three keys: **Name** (name
             of the DO term), **Weights** (pretrained model weights),
             **PosGenes** (positive genes for this DO term).

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,7 @@
 numpy==1.23.5
 requests==2.31.0
 scikit-learn==1.3.0
+statsmodels==0.14.2
 tqdm==4.65.0
 pystow==0.5.0
 pyyaml==6.0.1
diff --git a/setup.cfg b/setup.cfg
@@ -40,6 +40,7 @@ install_requires =
     scikit-learn >= 1.0.0
     scipy >= 1.6.2
     pandas >= 1.2.4
+    statsmodels >= 0.14.0
     requests
     tqdm
     pystow