Skip to content

Commit

Permalink
Mondo and prop (#313)
Browse files Browse the repository at this point in the history
* swapped out new file names

* updated filenames for Symbol-to-Entrez

* changed disgenet to mondo

* added error handling for no monarch fly

* added zscores and p-values to results tables

* added statsmodels to do p-value adjustments

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
ChristopherMancuso and pre-commit-ci[bot] authored May 8, 2024
1 parent 3499a60 commit 62cfc9a
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 38 deletions.
4 changes: 2 additions & 2 deletions geneplexus/_config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
ALL_TASKS = ["IDconversion", "MachineLearning", "Similarities", "NetworkGraph", "OriginalGSCs"]
ALL_NETWORKS = ["BioGRID", "STRING", "IMP"]
ALL_FEATURES = ["SixSpeciesN2V"]
ALL_GSCS = ["GO", "Monarch", "DisGeNet", "Combined"]
ALL_GSCS = ["GO", "Monarch", "Mondo", "Combined"]
ALL_SPECIES = ["Human", "Mouse", "Fly", "Worm", "Fish", "Yeast"]

DEFAULT_LOGREG_KWARGS: Dict[str, Any] = {
Expand All @@ -53,7 +53,7 @@
TASK_TYPE = Literal["IDconversion", "MachineLearning", "Similarities", "NetworkGraph", "OriginalGSCs"]
NET_TYPE = Literal["BioGRID", "STRING", "IMP"]
FEATURE_TYPE = Literal["SixSpeciesN2V"]
GSC_TYPE = Literal["GO", "Monarch", "DisGeNet", "Combined"]
GSC_TYPE = Literal["GO", "Monarch", "Mondo", "Combined"]
SPECIES_TYPE = Literal["Human", "Mouse", "Fly", "Worm", "Fish", "Yeast"]

TASK_SELECTION_TYPE = Union[Literal["All"], TASK_TYPE, List[TASK_TYPE]]
Expand Down
30 changes: 6 additions & 24 deletions geneplexus/_config/data_filenames.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
IDconversion__Mouse__Entrez-to-ENSG.json
IDconversion__Mouse__Entrez-to-Name.json
GSC__Zebrafish__GO__STRING.json
GSC__Zebrafish__Combined__IMP.json
GSC__Yeast__Combined__IMP.json
GSC__Yeast__Monarch__BioGRID.json
IDconversion__Fly__Entrez-to-ENSG.json
GSC__Yeast__Monarch__STRING.json
NodeOrder__Fly__BioGRID.txt
GSC__Fly__Combined__IMP.json
Data__Yeast__SixSpeciesN2V__IMP.npy
NodeOrder__Mouse__IMP.txt
GSC__Mouse__Combined__BioGRID.json
Expand All @@ -16,11 +13,9 @@ GSC__Human__Monarch__BioGRID.json
GSC__Yeast__GO__IMP.json
IDconversion__Zebrafish__ENSP-to-Entrez.json
IDconversion__Worm__Entrez-to-Name.json
IDconversion__Worm__Entrez-to-ENSG.json
PreTrainedWeights__Fly__Combined__IMP__SixSpeciesN2V.json
GSC__Human__Mondo__STRING.json
Edgelist__Human__IMP.edg
IDconversion__Human__ENSP-to-Entrez.json
PreTrainedWeights__Human__DisGeNet__STRING__SixSpeciesN2V.json
GSC__Mouse__Monarch__IMP.json
PreTrainedWeights__Yeast__Combined__BioGRID__SixSpeciesN2V.json
NodeOrder__Mouse__STRING.txt
Expand All @@ -34,7 +29,6 @@ GSC__Human__GO__BioGRID.json
PreTrainedWeights__Yeast__GO__STRING__SixSpeciesN2V.json
GSC__Zebrafish__GO__IMP.json
Edgelist__Worm__BioGRID.edg
GSC__Fly__Monarch__BioGRID.json
PreTrainedWeights__Mouse__Combined__STRING__SixSpeciesN2V.json
IDconversion__Mouse__Entrez-to-Symbol.json
PreTrainedWeights__Zebrafish__Monarch__IMP__SixSpeciesN2V.json
Expand All @@ -48,23 +42,18 @@ NodeOrder__Zebrafish__IMP.txt
GSC__Yeast__Combined__BioGRID.json
IDconversion__Mouse__ENST-to-Entrez.json
IDconversion__Human__ENST-to-Entrez.json
PreTrainedWeights__Fly__Monarch__IMP__SixSpeciesN2V.json
IDconversion__Fly__Entrez-to-Name.json
NodeOrder__Worm__STRING.txt
PreTrainedWeights__Mouse__Combined__IMP__SixSpeciesN2V.json
PreTrainedWeights__Yeast__GO__BioGRID__SixSpeciesN2V.json
GSC__Fly__Combined__STRING.json
IDconversion__Zebrafish__ENSG-to-Entrez.json
GSC__Human__DisGeNet__IMP.json
NodeOrder__Worm__IMP.txt
IDconversion__Human__Entrez-to-Name.json
Edgelist__Yeast__IMP.edg
GSC__Mouse__Combined__STRING.json
Data__Fly__SixSpeciesN2V__BioGRID.npy
GSC__Mouse__GO__IMP.json
PreTrainedWeights__Human__DisGeNet__IMP__SixSpeciesN2V.json
Data__Worm__SixSpeciesN2V__STRING.npy
PreTrainedWeights__Fly__Combined__STRING__SixSpeciesN2V.json
Data__Fly__SixSpeciesN2V__IMP.npy
PreTrainedWeights__Mouse__Monarch__BioGRID__SixSpeciesN2V.json
NodeOrder__Human__BioGRID.txt
Expand All @@ -74,7 +63,6 @@ GSC__Worm__Combined__IMP.json
Data__Mouse__SixSpeciesN2V__BioGRID.npy
Data__Mouse__SixSpeciesN2V__STRING.npy
IDconversion__Worm__ENSP-to-Entrez.json
GSC__Fly__Combined__BioGRID.json
GSC__Human__Combined__IMP.json
NodeOrder__Yeast__STRING.txt
Edgelist__Yeast__STRING.edg
Expand All @@ -88,8 +76,8 @@ NodeOrder__Human__IMP.txt
IDconversion__Zebrafish__Entrez-to-Name.json
GSC__Mouse__GO__BioGRID.json
NodeOrder__Worm__BioGRID.txt
PreTrainedWeights__Human__Mondo__STRING__SixSpeciesN2V.json
Edgelist__Fly__IMP.edg
PreTrainedWeights__Fly__Monarch__STRING__SixSpeciesN2V.json
IDconversion__Human__Symbol-to-Entrez.json
GSC__Human__Combined__BioGRID.json
NodeOrder__Yeast__IMP.txt
Expand All @@ -113,7 +101,9 @@ Edgelist__Yeast__BioGRID.edg
IDconversion__Fly__ENST-to-Entrez.json
NodeOrder__Yeast__BioGRID.txt
IDconversion__Fly__ENSP-to-Entrez.json
PreTrainedWeights__Human__Mondo__IMP__SixSpeciesN2V.json
IDconversion__Fly__Entrez-to-Symbol.json
GSC__Human__Mondo__BioGRID.json
GSC__Yeast__GO__BioGRID.json
PreTrainedWeights__Yeast__Monarch__STRING__SixSpeciesN2V.json
GSC__Fly__GO__IMP.json
Expand All @@ -127,22 +117,20 @@ GSC__Yeast__GO__STRING.json
Data__Mouse__SixSpeciesN2V__IMP.npy
PreTrainedWeights__Human__Combined__BioGRID__SixSpeciesN2V.json
IDconversion__Mouse__ENSG-to-Entrez.json
GSC__Fly__Monarch__STRING.json
GSC__Worm__Combined__BioGRID.json
NodeOrder__Fly__STRING.txt
GSC__Worm__Monarch__IMP.json
IDconversion__Yeast__ENSG-to-Entrez.json
IDconversion__Yeast__Symbol-to-Entrez.json
GSC__Mouse__GO__STRING.json
IDconversion__Human__Entrez-to-ENSG.json
GSC__Worm__Monarch__STRING.json
PreTrainedWeights__Human__Monarch__BioGRID__SixSpeciesN2V.json
IDconversion__Zebrafish__Entrez-to-ENSG.json
PreTrainedWeights__Worm__Combined__STRING__SixSpeciesN2V.json
NodeOrder__Mouse__BioGRID.txt
GSC__Worm__Monarch__BioGRID.json
PreTrainedWeights__Zebrafish__GO__IMP__SixSpeciesN2V.json
PreTrainedWeights__Mouse__Monarch__IMP__SixSpeciesN2V.json
PreTrainedWeights__Human__Mondo__BioGRID__SixSpeciesN2V.json
PreTrainedWeights__Zebrafish__Monarch__STRING__SixSpeciesN2V.json
NodeOrder__Fly__IMP.txt
PreTrainedWeights__Worm__GO__BioGRID__SixSpeciesN2V.json
Expand All @@ -156,14 +144,12 @@ GSC__Zebrafish__Monarch__STRING.json
GSC__Human__Monarch__IMP.json
Data__Human__SixSpeciesN2V__BioGRID.npy
GSC__Human__GO__IMP.json
GSC__Fly__Monarch__IMP.json
Edgelist__Fly__STRING.edg
PreTrainedWeights__Yeast__GO__IMP__SixSpeciesN2V.json
Data__Worm__SixSpeciesN2V__BioGRID.npy
Edgelist__Mouse__STRING.edg
Edgelist__Mouse__BioGRID.edg
IDconversion__Worm__Symbol-to-Entrez.json
PreTrainedWeights__Human__DisGeNet__BioGRID__SixSpeciesN2V.json
PreTrainedWeights__Human__GO__BioGRID__SixSpeciesN2V.json
Data__Fly__SixSpeciesN2V__STRING.npy
Data__Yeast__SixSpeciesN2V__STRING.npy
Expand All @@ -172,25 +158,21 @@ GSC__Human__Monarch__STRING.json
Data__Zebrafish__SixSpeciesN2V__IMP.npy
IDconversion__Fly__ENSG-to-Entrez.json
PreTrainedWeights__Human__Combined__STRING__SixSpeciesN2V.json
PreTrainedWeights__Fly__Combined__BioGRID__SixSpeciesN2V.json
IDconversion__Mouse__ENSP-to-Entrez.json
GSC__Human__Mondo__IMP.json
NodeOrder__Human__STRING.txt
PreTrainedWeights__Human__GO__IMP__SixSpeciesN2V.json
Edgelist__Worm__STRING.edg
IDconversion__Yeast__Entrez-to-ENSG.json
GSC__Human__DisGeNet__BioGRID.json
GSC__Human__GO__STRING.json
GSC__Worm__GO__BioGRID.json
PreTrainedWeights__Mouse__GO__IMP__SixSpeciesN2V.json
GSC__Human__DisGeNet__STRING.json
Edgelist__Zebrafish__STRING.edg
Data__Human__SixSpeciesN2V__IMP.npy
IDconversion__Zebrafish__Symbol-to-Entrez.json
PreTrainedWeights__Worm__Combined__BioGRID__SixSpeciesN2V.json
PreTrainedWeights__Worm__GO__STRING__SixSpeciesN2V.json
IDconversion__Yeast__ENSP-to-Entrez.json
IDconversion__Worm__ENSG-to-Entrez.json
PreTrainedWeights__Fly__Monarch__BioGRID__SixSpeciesN2V.json
PreTrainedWeights__Mouse__Combined__BioGRID__SixSpeciesN2V.json
IDconversion__Fly__Symbol-to-Entrez.json
PreTrainedWeights__Human__Monarch__IMP__SixSpeciesN2V.json
Expand Down
26 changes: 21 additions & 5 deletions geneplexus/_geneplexus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.stats import hypergeom
from scipy.stats import norm
from scipy.stats import rankdata
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.multitest import multipletests

from . import util
from ._config import logger
Expand Down Expand Up @@ -193,16 +195,21 @@ def _make_prob_df(file_loc, sp_trn, sp_tst, net_type, probs, pos_genes_in_net, n
syms_tmp = util.mapgene(net_genes[idx], Entrez_to_Symbol)
name_tmp = util.mapgene(net_genes[idx], Entrez_to_Name)
if sp_trn == sp_tst:
prob_results.append([net_genes[idx], syms_tmp, name_tmp, probs[idx], novel_label, class_label])
prob_results.append([net_genes[idx], syms_tmp, name_tmp, novel_label, class_label, probs[idx]])
else:
prob_results.append([net_genes[idx], syms_tmp, name_tmp, probs[idx]])
if sp_trn == sp_tst:
df_col_names = ["Entrez", "Symbol", "Name", "Probability", "Known/Novel", "Class-Label"]
df_col_names = ["Entrez", "Symbol", "Name", "Known/Novel", "Class-Label", "Probability"]
else:
df_col_names = ["Entrez", "Symbol", "Name", "Probability"]
df_probs = pd.DataFrame(prob_results, columns=df_col_names)
df_probs = df_probs.astype({"Entrez": str, "Probability": float})
df_probs = df_probs.sort_values(by=["Probability"], ascending=False).reset_index(drop=True)
z = zscore(df_probs["Probability"].to_numpy())
p = norm.sf(abs(z))
rejects, padjusts, b, c = multipletests(p, method="bonferroni", is_sorted=True)
df_probs["Z-score"] = z
df_probs["P-adjusted"] = padjusts
df_probs["Rank"] = rankdata(1 / (df_probs["Probability"].to_numpy() + 1e-9), method="min")
return df_probs

Expand All @@ -221,12 +228,16 @@ def _make_sim_dfs(file_loc, mdl_weights, species, gsc, net_type, features):
for idx2, termID_tmp in enumerate(gsc_terms):
ID_tmp = termID_tmp
Name_tmp = weights_dict[termID_tmp]["Name"]
mdl_sim_tmp = mdl_sims[idx2]
z_tmp = z[idx2]
results_tmp.append([ID_tmp, Name_tmp, z_tmp])
df_sim = pd.DataFrame(results_tmp, columns=["ID", "Name", "Similarity"]).sort_values(
results_tmp.append([ID_tmp, Name_tmp, mdl_sim_tmp, z_tmp])
df_sim = pd.DataFrame(results_tmp, columns=["ID", "Name", "Similarity", "Z-score"]).sort_values(
by=["Similarity"],
ascending=False,
)
p = norm.sf(abs(df_sim["Z-score"].to_numpy()))
rejects, padjusts, b, c = multipletests(p, method="bonferroni", is_sorted=True)
df_sim["P-adjusted"] = padjusts
df_sim["Rank"] = rankdata(-1 * (df_sim["Similarity"].to_numpy() + 1e-9), method="min")
return df_sim, weights_dict

Expand All @@ -235,11 +246,16 @@ def _make_small_edgelist(file_loc, df_probs, species, net_type, num_nodes=50):
# This will set the max number of genes to look at to a given number
# Load network as edge list dataframe
filepath = osp.join(file_loc, f"Edgelist__{species}__{net_type}.edg")
df_edge = pd.read_csv(filepath, sep="\t", header=None, names=["Node1", "Node2"])
if net_type == "BioGRID":
df_edge = pd.read_csv(filepath, sep="\t", header=None, names=["Node1", "Node2"])
else:
df_edge = pd.read_csv(filepath, sep="\t", header=None, names=["Node1", "Node2", "Weight"])
df_edge = df_edge.astype({"Node1": str, "Node2": str})
# Take subgraph induced by top genes
top_genes = df_probs["Entrez"].to_numpy()[:num_nodes]
df_edge = df_edge[(df_edge["Node1"].isin(top_genes)) & (df_edge["Node2"].isin(top_genes))]
if net_type == "BioGRID":
df_edge["Weight"] = [1.0] * df_edge.shape[0]
genes_in_edge = np.union1d(df_edge["Node1"].unique(), df_edge["Node2"].unique())
isolated_genes = np.setdiff1d(top_genes, genes_in_edge).tolist()
# Convert to gene symbol
Expand Down
4 changes: 2 additions & 2 deletions geneplexus/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def run_pipeline(gp: GenePlexus, num_nodes: int, skip_mdl_sim: bool):
num_nodes: Number of top predicted genes to include in the induced
subgraph.
skip_mdl_sim: Whether or not to skip the computation of model
similarities with GO and DisGeNet. This option is not yet available
similarities with GO and Mondo. This option is not yet available
for custom networks.
"""
Expand Down Expand Up @@ -203,7 +203,7 @@ def save_results(gp, outdir, zip_output, overwrite, skip_mdl_sim):
zip_output: Whether or not to zip the output directory into a zip file.
overwrite: Whether or not to overwrite existing results.
skip_mdl_sim: Whether or not to skip the computation of model
similarities with GO and DisGeNet. This option is not yet available
similarities with GO and Mondo. This option is not yet available
for custom networks.
"""
Expand Down
2 changes: 1 addition & 1 deletion geneplexus/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def subset_gsc_to_network(
Note:
Use the :meth:`geneplexus.download.download_select_data` function to
get the preprocessed GO and DisGeNet files first.
get the preprocessed GO and Mondo files first.
Args:
data_dir: The directory to save the file
Expand Down
4 changes: 4 additions & 0 deletions geneplexus/exception.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""GenePlexus exceptions."""


class FlyMonarchError(Exception):
"""Raised because there are no Monarch annotations for Fly."""


class ZebrafishBioGRIDError(Exception):
"""Raised when Zebrafish + BioGRID is tried."""

Expand Down
19 changes: 15 additions & 4 deletions geneplexus/geneplexus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from ._config.logger_util import set_stream_level
from .download import download_select_data
from .exception import CustomDataError
from .exception import FlyMonarchError
from .exception import ZebrafishBioGRIDError


Expand Down Expand Up @@ -83,7 +84,7 @@ def __init__(
"All",
self.net_type,
self.features,
["GO", "DisGeNet"],
["GO", "Mondo"],
log_level=log_level,
)

Expand All @@ -97,6 +98,16 @@ def __init__(
"so this combination is not allowed.",
)

if (
(self.sp_trn == "Fly" and self.gsc_trn == "Monarch")
or (self.sp_trn == "Fly" and self.gsc_trn == "Combined")
or (self.sp_tst == "Fly" and self.gsc_tst == "Monarch")
or (self.sp_tst == "Fly" and self.gsc_tst == "Combined")
):
raise FlyMonarchError(
f"Fly has no annotations for Monarch.",
)

@property
def _params(self) -> List[str]:
return [
Expand Down Expand Up @@ -371,10 +382,10 @@ def _get_pos_and_neg_genes(self):
return self.pos_genes_in_net, self.negative_genes, self.net_genes

def make_sim_dfs(self):
"""Compute similarities bewteen the input genes and GO or DisGeNet.
"""Compute similarities between the input genes and GO or Mondo.
The similarities are computed based on the model trained on the input
gene set and models pre-trained on known GO and DisGeNet gene sets.
gene set and models pre-trained on known GO and Mondo gene sets.
:attr:`GenePlexus.df_sim_GO` (DataFrame)
A table with 4 columns: **ID** (the GO term ID), **Name** (name of
Expand All @@ -394,7 +405,7 @@ def make_sim_dfs(self):
the GO term), **Weights** (pretrained model weights), **PosGenes**
(positive genes for this GO term).
:attr:`GenePlexus.weights_Dis`
Dictionary of pretrained model weights for DisGeNet. A key is a DO
Dictionary of pretrained model weights for Mondo. A key is a DO
term, and the value is a dictionary with three keys: **Name** (name
of the DO term), **Weights** (pretrained model weights),
**PosGenes** (positive genes for this DO term).
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
numpy==1.23.5
requests==2.31.0
scikit-learn==1.3.0
statsmodels==0.14.2
tqdm==4.65.0
pystow==0.5.0
pyyaml==6.0.1
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ install_requires =
scikit-learn >= 1.0.0
scipy >= 1.6.2
pandas >= 1.2.4
statsmodels >= 0.14.0
requests
tqdm
pystow
Expand Down

0 comments on commit 62cfc9a

Please sign in to comment.